I am currently trying to solve a bug but believe the data I am working with may be too complex and cause errors that shouldn't normally occur. I've written a function, and was hoping to add a try or tryCatch statement to skip the error if it occurs. I currently have:
library(glmnet)
foo <- function(data, ols_ps = TRUE, index) {
# index is the bootstrap sample index
x <- data[index, -1]
y <- data[index, 1]
ridge <- cv.glmnet(x, y, alpha = 0)
## The intercept estimate should be dropped.
weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
# alpha=1, lasso
alasso <- cv.glmnet(x, y, alpha = 1,
penalty.factor = 1 / abs(weights))
# Select nonzero coefficients
coef <- as.vector(coef(alasso, s = alasso$lambda.min,
exact = TRUE, x = x, y = y,
penalty.factor = 1 / abs(weights)))[-1]
if (ols_ps == TRUE) {
coef_nonzero <- coef != 0
new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
error=function(e) NA)
if (!any(is.na(new_x)) & ncol(new_x) > 0) {
ls.obj <- lm(y ~ new_x)
ls_coef <- (ls.obj$coefficients)[-1]
coef[coef_nonzero] <- ls_coef
} else {
coef <- coef
}
} else {
coef <- coef
}
return(coef)
}
which normally works and works on most datasets. I think the error may be coming from a complex dataset. Is it possible to skip OLS if I get the below error?
"Error in x[, coef_nonzero, drop = FALSE] : \n (subscript) logical subscript too long\n"
attr(,"class")
Here is a minimal working example per request.
set.seed(123)
matrix <- matrix(runif(1000), ncol=10)
boot(matrix,foo,R=50)
Thanks in advance.
Maybe like this?
foo <- function(data, index) {
# index is the bootstrap sample index
x <- data[index, -1]
y <- data[index, 1]
ridge <- cv.glmnet(x, y, alpha = 0)
## The intercept estimate should be dropped.
weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
# alpha=1, lasso
alasso <- cv.glmnet(x, y, alpha = 1,
penalty.factor = 1 / abs(weights))
# Select nonzero coefficients
coef <- as.vector(coef(alasso, s = alasso$lambda.min,
exact = TRUE, x = x, y = y,
penalty.factor = 1 / abs(weights)))[-1]
coef_nonzero <- coef != 0
new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
error=function(e) NA)
if (!any(is.na(new_x))) {
ls.obj <- lm(y ~ new_x)
ls_coef <- (ls.obj$coefficients)[-1]
coef[coef_nonzero] <- ls_coef
}
return(coef)
}
The problem is that we have no case when it fails so far.
I have the following script
Posdef <- function (n, ev = runif(n, 0, 10))
{
Z <- matrix(ncol=n, rnorm(n^2))
decomp <- qr(Z)
Q <- qr.Q(decomp)
R <- qr.R(decomp)
d <- diag(R)
ph <- d / abs(d)
O <- Q %*% diag(ph)
Z <- t(O) %*% diag(ev) %*% O
return(Z)
}
Sigma <- Posdef(n = 11)
mu <- runif(11,0,10)
data <- as.data.frame(mvrnorm(n=1000, mu, Sigma))
data[data < 0] <- 0 #setting a floor#
data[data > 10] <- 10 #setting a ceiling#
names(data) = c('criteria_1', 'criteria_2', 'criteria_3', 'criteria_4', 'criteria_5',
'criteria_6', 'criteria_7', 'criteria_8', 'criteria_9', 'criteria_10',
'outcome')
data$outcome <- ifelse(data$outcome > 5, 1, 0)
data <- data[, sapply(data, is.numeric)]
maxValue <- as.numeric(apply (data, 2, max))
minValue <- as.numeric(apply (data, 2, min))
data_scaled <- as.data.frame(scale(data, center = minValue,
scale = maxValue-minValue))
ind <- sample (1:nrow(data_scaled), 600)
train <- data_scaled[ind,]
test <- data_scaled[-ind,]
model <- glm (formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
family = "binomial",
data = train)
summary (model)
predicted_model <- predict(model, test)
neural_model <- neuralnet(formula =
outcome ~ criteria_1 + criteria_2 + criteria_3 + criteria_4 + criteria_5 +
criteria_6 + criteria_7 + criteria_8 + criteria_9 + criteria_10,
hidden = c(2,2) ,
threshold = 0.01,
stepmax = 1e+07,
startweights = NULL,
rep = 1,
learningrate = NULL,
algorithm = "rprop+",
linear.output=FALSE,
data= train)
plot (neural_model)
results <- compute (neural_model, test[1:10])
results <- results$net.result*(max(data$outcome)-
min(data$outcome))+ min(data$outcome)
Values <- (test$outcome)*(max(data$outcome)-
min(data$outcome)) + min(data$outcome)
MSE_nueral_model <- sum((results - Values)^2)/nrow(test)
MSE_model <- sum((predicted_model - test$outcome)^2)/nrow(test)
print(MSE_model - MSE_nueral_model)
R1 <- (MSE_model - MSE_nueral_model)
The purpose of this script is to generate some arbitrary multivariate distribution and then compare two methods. In this case its a neural net and logistic regression. The end result is a difference in mean square error.
Now my issue with creating a loop has been with generating the 1000 observations.
I am able to create a loop without the data simulation portion of the script, putting that into the loop seems to make things go haywire. I tried creating a column vector filled with NA's but all I ended up getting was a single value returned rather than a vector of length n populated by the MSE reductions for each iteration of the loop.
Any help would be greatly appreciated.
I'm having troubles reimplementing a model from winbugs on rjags. I'm getting the Invalid parent values error which is the error you get when censoring was not correctly setup, but I can't see my mistake.
This is the original model on WinBugs:
model {
for(i in 1 : N) {
times[i] ~ dweib(v, lambda[i]) T(censor[i],)
lambda[i] <- exp(beta0 + beta1*type[i])
S[i] <- exp(-lambda[i]*pow(times[i],v));
f[i] <- lambda[i]*v*pow(times[i],v-1)*S[i]
h[i] <- f[i]/S[i]
}
beta0 ~ dnorm(0.0, 0.0001)
beta1 ~ dnorm(0.0, 0.0001)
v ~ dexp(0.001)
median0 <- pow(log(2) * exp(-beta0), 1/v)
median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}
Setting up a reproducible example:
type <- as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0))
censor <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,882,892,1031,
1033,1306,1335,0,1452,1472,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,381,0,0,0,0,0,0,0,0,0,529,0,
0,0,0,0,0,0,0,0,945,0,0,1180,0,0,1277,1397,1512,1519)
times <-c (17,42,44,48,60,72,74,95,103,108,122,144,167,170,183,185,193,195,197,208,234,235,254,307,315,401,
445,464,484,528,542,567,577,580,795,855,NA,NA,NA,NA,NA,NA,1366,NA,NA,1,63,105,129,182,216,250,262,
301,301,342,354,356,358,380,NA,383,383,388,394,408,460,489,499,524,NA,535,562,675,676,748,748,778,
786,797,NA,955,968,NA,1245,1271,NA,NA,NA,NA)
df <- tibble(type = type, censor = censor, time = times) %>%
mutate(censor_limit = replace(censor, censor == 0, max(times, na.rm = TRUE))) %>%
mutate(is_censored = ifelse(is.na(time), 1, 0)) %>%
mutate(time_init = ifelse(is_censored == 1, censor_limit + 1, NA))
df$censor <- NULL
head(df)
And this is the rjags part:
m <- textConnection("model {
for(i in 1 : N) {
isCensored[i] ~ dinterval(times[i], censorLimit[i])
times[i] ~ dweib(v, lambda[i])
lambda[i] <- exp(beta0 + beta1*type[i])
S[i] <- exp(-lambda[i]*pow(times[i],v));
f[i] <- lambda[i]*v*pow(times[i],v-1)*S[i]
h[i] <- f[i]/S[i]
}
beta0 ~ dnorm(0.0, 0.0001)
beta1 ~ dnorm(0.0, 0.0001)
v ~ dexp(0.001)
# Median survival time
median0 <- pow(log(2) * exp(-beta0), 1/v)
median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}")
d <- list(N = nrow(df), times = df$time, type = df$type, isCensored = df$is_censored,
censorLimit = df$censor_limit)
inits1 = function() {
inits = list(v = 1, beta0 = 0, beta1=0, times = df$time_init)
}
mod <- jags.model(m, data = d, inits = inits1, n.chains = 3)
update(mod, 1e3)
mod_sim <- coda.samples(model = mod, variable.names = c("lambda", "median0", "median1"), n.iter = 5e3)
mod_csim <- as.mcmc(do.call(rbind, mod_sim))
Output:
Compiling model graph
Resolving undeclared variables
Allocating nodes
Graph information:
Observed stochastic nodes: 164
Unobserved stochastic nodes: 19
Total graph size: 910
Initializing model
Deleting model
Error in jags.model(m, data = d, inits = inits1, n.chains = 3): Error in node h[35]
Invalid parent values
The data I'm dealing with occasionally has a "perfectly fitting" linear model. For each regression I run, I need to extract the r.squared value which I've been doing with summary(mymodel)$r.squared but this fails in the case of a perfectly fitting model (see below).
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
0.5294
How can I handle these cases? Basically, I think I want to do something like
If(mymodel is a perfect fit)
rsquared = 1
else
rsquared = summary(mymodel)$r.squared
You can use tryCatch
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
tryCatch(summary(mymodel)$r.squared, warning = function(w) return(1))
# [1] 1
And with an added conditional to catch specific warnings
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
summary(mymodel)$r.squared #This raises a warning
f <- function(expr) {
tryCatch(expr,
warning = function(w) {
if (grepl('perfect fit', w))
return(1)
else return(w)
})
}
f(TRUE)
# [1] TRUE
f(sum(1:5))
# [1] 15
f(summary(mymodel)$r.squared)
# [1] 1
f(warning('this is not a fit warning'))
# <simpleWarning in doTryCatch(return(expr), name, parentenv, handler): this is not a fit warning>
If you want to make sure that everything will be working perfect then you can just slightly modify the source code (type summary.lm to see the original code):
df <- data.frame(x = c(1,2,3,4,5), y = c(1,1,1,1,1))
mymodel <- lm(y ~ x, data = df)
This is how i modified it. All is the same as the original summary function apart from the bit at the bottom of the function.
summary2 <- function (object, correlation = FALSE, symbolic.cor = FALSE,
...)
{
z <- object
p <- z$rank
rdf <- z$df.residual
if (p == 0) {
r <- z$residuals
n <- length(r)
w <- z$weights
if (is.null(w)) {
rss <- sum(r^2)
}
else {
rss <- sum(w * r^2)
r <- sqrt(w) * r
}
resvar <- rss/rdf
ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
class(ans) <- "summary.lm"
ans$aliased <- is.na(coef(object))
ans$residuals <- r
ans$df <- c(0L, n, length(ans$aliased))
ans$coefficients <- matrix(NA, 0L, 4L)
dimnames(ans$coefficients) <- list(NULL, c("Estimate",
"Std. Error", "t value", "Pr(>|t|)"))
ans$sigma <- sqrt(resvar)
ans$r.squared <- ans$adj.r.squared <- 0
return(ans)
}
if (is.null(z$terms))
stop("invalid 'lm' object: no 'terms' component")
if (!inherits(object, "lm"))
warning("calling summary.lm(<fake-lm-object>) ...")
Qr <- qr(object)
n <- NROW(Qr$qr)
if (is.na(z$df.residual) || n - p != z$df.residual)
warning("residual degrees of freedom in object suggest this is not an \"lm\" fit")
r <- z$residuals
f <- z$fitted.values
w <- z$weights
if (is.null(w)) {
mss <- if (attr(z$terms, "intercept"))
sum((f - mean(f))^2)
else sum(f^2)
rss <- sum(r^2)
}
else {
mss <- if (attr(z$terms, "intercept")) {
m <- sum(w * f/sum(w))
sum(w * (f - m)^2)
}
else sum(w * f^2)
rss <- sum(w * r^2)
r <- sqrt(w) * r
}
resvar <- rss/rdf
p1 <- 1L:p
R <- chol2inv(Qr$qr[p1, p1, drop = FALSE])
se <- sqrt(diag(R) * resvar)
est <- z$coefficients[Qr$pivot[p1]]
tval <- est/se
ans <- z[c("call", "terms", if (!is.null(z$weights)) "weights")]
ans$residuals <- r
ans$coefficients <- cbind(est, se, tval, 2 * pt(abs(tval),
rdf, lower.tail = FALSE))
dimnames(ans$coefficients) <- list(names(z$coefficients)[Qr$pivot[p1]],
c("Estimate", "Std. Error", "t value", "Pr(>|t|)"))
ans$aliased <- is.na(coef(object))
ans$sigma <- sqrt(resvar)
ans$df <- c(p, rdf, NCOL(Qr$qr))
if (p != attr(z$terms, "intercept")) {
df.int <- if (attr(z$terms, "intercept"))
1L
else 0L
ans$r.squared <- mss/(mss + rss)
ans$adj.r.squared <- 1 - (1 - ans$r.squared) * ((n -
df.int)/rdf)
ans$fstatistic <- c(value = (mss/(p - df.int))/resvar,
numdf = p - df.int, dendf = rdf)
}
else ans$r.squared <- ans$adj.r.squared <- 0
ans$cov.unscaled <- R
dimnames(ans$cov.unscaled) <- dimnames(ans$coefficients)[c(1,
1)]
#below is the only change to the code
#instead of ans$r.squared <- 1 the original code had a warning
if (is.finite(resvar) && resvar < (mean(f)^2 + var(f)) *
1e-30) {
ans$r.squared <- 1 #this is practically the only change in the source code. Originally it had the warning here
}
#moved the above lower in the order of the code so as not to affect the original code
#checked it and seems to be working properly
if (correlation) {
ans$correlation <- (R * resvar)/outer(se, se)
dimnames(ans$correlation) <- dimnames(ans$cov.unscaled)
ans$symbolic.cor <- symbolic.cor
}
if (!is.null(z$na.action))
ans$na.action <- z$na.action
class(ans) <- "summary.lm"
ans
}
Run the new formula and see that it works now without any warnings. No other if or else if conditions are required.
> summary2(mymodel)$r.squared
[1] 1
One option to catch a perfect fit is to determine the residuals: if it is a perfect fit, the sum of residuals will be zero.
x = 1:5
# generate 3 sets of y values, last set is random values
y = matrix(data = c(rep(1,5),1:5,rnorm(5)), nrow = 5)
tolerance = 0.0001
r.sq = array(NA,ncol(y))
# check fit for three sets
for (i in 1:ncol(y)){
fit = lm(y[,i]~x)
# determine sum of residuals
if (sum(abs(resid(fit))) < tolerance) {
# perfect fit case
r.sq[i] = 1 } else {
# non-perfect fit case
r.sq[i] = summary(fit)$r.squared
}
}
print(r.sq)
# [1] 1.0000000 1.0000000 0.7638879