I am using recursive feature elimination from the R package 'caret'.
Linear regression works fine for my problem, therefore I am using functions = lmFuncs inside my control function.
But I would like to test this setup again without an intercept. Is this possible?
My current code:
control <- rfeControl(functions = lmFuncs
, verbose = FALSE
)
results <- rfe(df_train
, df_train
, rfeControl=control
)
I would also be open to a custom function, but I do not know how to write one.
Many thanks in advance.
Edit:
I found the answer after having a deeper look into the caret package.
lmFuncs without Intercept:
lmFuncs_wo_intercept <- list(
summary = defaultSummary,
fit = function(x, y, first, last, ...) {
tmp <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
#lm(y~., data = tmp) #old
lm(y~0+., data = tmp) #new
},
pred = function(object, x) {
if(!is.data.frame(x)) x <- as.data.frame(x, stringsAsFactors = TRUE)
predict(object, x)
},
rank = function(object, x, y) {
coefs <- abs(coef(object))
#coefs <- coefs[names(coefs) != "(Intercept)"] # old
coefs[is.na(coefs)] <- 0
vimp <- data.frame(Overall = unname(coefs),
var = names(coefs))
rownames(vimp) <- names(coefs)
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars
)
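For completeness, a minimal sketch of how this list plugs into the RFE setup; x_train and y_train are hypothetical placeholder names for your own predictors and response:
control <- rfeControl(functions = lmFuncs_wo_intercept
                      , verbose = FALSE
                      )
results <- rfe(x = x_train   # predictors (placeholder name)
               , y = y_train # response (placeholder name)
               , rfeControl = control
               )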
lmFuncs is your linear regression? In this case you could try to fit a second linear regression without the intercept and then apply the feature elimination function.
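A minimal sketch of the no-intercept fit itself (df and y are placeholder names):
# two equivalent ways to fit a linear model without an intercept
fit1 <- lm(y ~ 0 + ., data = df)   # '0 +' drops the intercept
fit2 <- lm(y ~ . - 1, data = df)   # '- 1' does the same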
library(GLMsData)
data(fluoro)
lambda <- seq(-2,2,0.5)
lm.out <- list()
for(i in length(lambda)){
if(i != 0){
y <- (fluoro$Dose^lambda-1)/lambda
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y[i]~Time, data = fluoro, na.exclude = T)
}
print(lm.out)
Error in model.frame.default(formula = y[i] ~ Time, data = fluoro, drop.unused.levels = TRUE) : variable lengths differ (found for 'Time')
I am trying to use various transformations of the response variable, fit the corresponding models, and obtain residual plots for each model.
I need help. Thanks.
Here is a corrected version of the for loop in the question.
data(fluoro, package = "GLMsData")
lambda <- seq(-2, 2, 0.5)
lm.out <- list()
for(i in 1:length(lambda)){
if(lambda[i] != 0){
y <- (fluoro$Dose^lambda[i]-1)/lambda[i]
} else {
y <- log(fluoro$Dose)
}
lm.out[[i]] <- lm(y ~ Time, data = fluoro, na.action = na.exclude)
}
print(lm.out)
And a version with a boxcox function defined and used in a lapply loop.
boxcox <- function(x, lambda, na.rm = FALSE){
if(na.rm) x <- x[!is.na(x)]
if(lambda == 0){
log(x)
} else {
(x^lambda - 1)/lambda
}
}
lm_out2 <- lapply(lambda, \(l){
lm(boxcox(Dose, lambda = l) ~ Time, data = fluoro, na.action = na.exclude)
})
Check that both ways above produce the same results.
coef_list <- sapply(lm.out, coef)
coef_list2 <- sapply(lm_out2, coef)
identical(coef_list, coef_list2)
#[1] TRUE
smry_list <- lapply(lm.out, summary)
smry_list2 <- lapply(lm_out2, summary)
pval_list <- sapply(smry_list, \(fit) fit$coefficients[, "Pr(>|t|)"])
pval_list2 <- sapply(smry_list2, \(fit) fit$coefficients[, "Pr(>|t|)"])
identical(pval_list, pval_list2)
#[1] TRUE
R2_list <- sapply(smry_list, "[[", "r.squared")
R2_list2 <- sapply(smry_list2, "[[", "r.squared")
identical(R2_list, R2_list2)
#[1] TRUE
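The question also asks for residual plots for each model; here is a minimal sketch using base graphics, reusing the lm.out list from above:
# one residuals-vs-fitted plot per lambda value (9 values -> 3 x 3 grid)
par(mfrow = c(3, 3))
for(i in seq_along(lambda)){
  plot(fitted(lm.out[[i]]), resid(lm.out[[i]]),
       xlab = "Fitted values", ylab = "Residuals",
       main = paste("lambda =", lambda[i]))
  abline(h = 0, lty = 2)
}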
I'm tuning parameters with a custom summaryFunction in caret.
I originally thought that with K-fold cross-validation and N input data points, performance would be measured on roughly N/K data points per fold, i.e. about 100 rows per fold for 500 points and 5 folds.
However, this apparently is not what happens: when I extract data$pred using browser() inside the summary function, the data handed to it has only 10 rows.
Since the input (df) has over 500 data points, this is far fewer than I expected.
Why does it only have 10 rows? Is there any way to increase this (i.e., to measure performance on more data points)?
Any help is appreciated. Thank you.
sigma.list <- seq(1, 5, 1)
c.list <- seq(1, 10, 1)
met <- "FValue"
#define evaluation function
eval <- function(data, lev = NULL, model = NULL){
mat <- table(data$pred, data$obs)
pre <- mat[1,1]/sum(mat[1,]) #precision
rec <- mat[1,1]/sum(mat[,1]) #recall
res <- c("Precision"=pre, "Recall"=rec, "FValue"=2*pre*rec/(pre+rec))
browser()
res
}
#define train control
tc <- trainControl(method = "cv",
                   number = 5,
                   summaryFunction = eval,
                   classProbs = TRUE)
#tune with caret
svm.tune <- train(Flag~.,
data = df,
method = "svmRadial",
tuneGrid = expand.grid(C=c.list, sigma=sigma.list),
trControl = tc,
metric = met
)
After tracking this down, it appears this is normal caret behavior.
I think that caret is essentially verifying that your summaryFunction is working properly by passing fake data (of length 10) to it. The function inside caret that is doing this is evalSummaryFunction.
I'm not quite sure what I'm doing in RStudio's debugger, but this code in train.default:
testSummary <- evalSummaryFunction(y, wts = weights,
ctrl = trControl, lev = classLevels, metric = metric,
method = method)
perfNames <- names(testSummary)
calls evalSummaryFunction which looks like:
function (y, wts = NULL, perf = NULL, ctrl, lev, metric, method)
{
n <- if (class(y)[1] == "Surv")
nrow(y)
else length(y)
if (class(y)[1] != "Surv") {
if (is.factor(y)) {
values <- rep_len(levels(y), min(10, n))
pred_samp <- factor(sample(values), levels = lev)
obs_samp <- factor(sample(values), levels = lev)
}
else {
pred_samp <- sample(y, min(10, n))
obs_samp <- sample(y, min(10, n))
}
}
else {
pred_samp <- y[sample(1:n, min(10, n)), "time"]
obs_samp <- y[sample(1:n, min(10, n)), ]
}
testOutput <- data.frame(pred = pred_samp, obs = obs_samp)
if (!is.null(perf)) {
if (is.vector(perf))
stop("`perf` should be a data frame", call. = FALSE)
perf <- perf[sample(1:nrow(perf), nrow(testOutput)),
, drop = FALSE]
testOutput <- cbind(testOutput, perf)
}
if (ctrl$classProbs) {
for (i in seq(along = lev)) testOutput[, lev[i]] <- runif(nrow(testOutput))
testOutput[, lev] <- t(apply(testOutput[, lev], 1, function(x) x/sum(x)))
}
else {
if (metric == "ROC" & !ctrl$classProbs)
stop("train()'s use of ROC codes requires class probabilities. See the classProbs option of trainControl()")
}
if (!is.null(wts))
testOutput$weights <- sample(wts, min(10, length(wts)))
testOutput$rowIndex <- sample(1:n, size = nrow(testOutput))
ctrl$summaryFunction(testOutput, lev, method)
}
It appears that 10 is the number of rows of fake data caret passes to your summary function to make sure it works properly.
If anyone can verify or explain better what caret is actually doing here, please post.
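If you want to confirm the row counts your summary function receives during the actual resampling (as opposed to this initial sanity check), a simple sketch is to log nrow(data) rather than calling browser():
eval <- function(data, lev = NULL, model = NULL){
  # after the initial 10-row check, each call should receive
  # roughly N/K rows, i.e. one held-out fold
  cat("rows passed to summaryFunction:", nrow(data), "\n")
  mat <- table(data$pred, data$obs)
  pre <- mat[1,1]/sum(mat[1,])   # precision
  rec <- mat[1,1]/sum(mat[,1])   # recall
  c(Precision = pre, Recall = rec, FValue = 2*pre*rec/(pre+rec))
}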
I want to write a function that will (hopefully) work in the raster calculator in the raster package. What I'm trying to do is regress each cell's values against a vector of Time. There are multiple examples of this, but what I would like is for the method to first try one type of regression (GLS, controlling for AR1 residual errors), and if that regression throws an error (perhaps because there is no AR1 structure in the residuals), to fall back to simple OLS regression.
I've written two functions for the regression. One for GLS:
# function for calculating the trend, variability, SNR, and residuals for each pixel
## this function will control for AR1 structure in the residuals
funTrAR1 <- function(x, ...) {if (sum(is.na(x)) >= 1) { NA } else {
mod <- nlme::gls(x ~ Year, na = na.omit, method = "REML", verbose = TRUE,
correlation = corAR1(form = ~ Year, fixed = FALSE),
control = glsControl(tolerance = 1e-3, msTol = 1e-3, opt = c("nlminb", "optim"),
singular.ok = TRUE, maxIter = 1000, msMaxIter = 1000))
slope <- mod$coefficients[2]
names(slope) <- "Trend"
var <- sd(mod$residuals)
names(var) <- "Variability"
snr <- slope/var
names(snr) <- "SNR"
residuals <- c(stats::quantile(
mod$residuals, probs = seq(0,1,0.25),
na.rm = TRUE, names = TRUE, type = 8),
base::mean(mod$residuals, na.rm = TRUE))
names(residuals) <- c("P0", "P25", "P50", "P75", "P100", "AvgResid")
return(c(slope, var, snr, residuals))}
}
and for OLS:
# function for calculating the trend, variability, SNR, and residuals for each pixel
## this function performs simple OLS
funTrOLS <- function(x, ...) {if (sum(is.na(x)) >= 1) { NA } else {
mod <- lm(x ~ Year, na.action = na.omit)
slope <- mod$coefficients[2]
names(slope) <- "TrendOLS"
var <- sd(mod$residuals)
names(var) <- "VariabilityOLS"
snr <- slope/var
names(snr) <- "SNROLS"
residuals <- c(stats::quantile(
mod$residuals, probs = seq(0,1,0.25),
na.rm = TRUE, names = TRUE, type = 8),
base::mean(mod$residuals, na.rm = TRUE))
names(residuals) <- c("P0", "P25", "P50", "P75", "P100", "AvgResid")
return(c(slope, var, snr, residuals))}
}
I'm trying to wrap these in a tryCatch expression which can be passed to raster::calc
xReg <- tryCatch(
{
funTrAR1
},
error = function(e) {
## this should create a text file if a model throws an error
sink(paste0(inDir, "/Outputs/localOLSErrors.txt"), append = TRUE)
cat(paste0("Used OLS regression (grid-cell) for model: ", m, ". Scenario: ", t, ". Variable: ", v, ". Realisation/Ensemble: ", r, ". \n"))
sink()
## run the second regression function
funTrOLS
}
)
This function is then passed to raster::calc like so
cellResults <- calc(rasterStack, fun = xReg)
Which, if everything works, will produce a raster stack of the output variables.
However, for some of my datasets the loop that I'm running all of this in stops and I receive the following error:
Error in nlme::gls(x ~ Year, na = na.omit, method = "REML", verbose = TRUE, :
false convergence (8)
Which is directly from nlme::gls and what I was hoping to avoid. I've never used tryCatch before (this might be very obvious), but does anyone know how to get the tryCatch() to move to the second regression function if the first (AR1) regression fails?
Here is another way to code this; perhaps that helps. Note that your tryCatch never fires: it wraps the function object funTrAR1 itself, and creating a function is not an error. The error only happens later, when raster::calc calls the function on a cell, so the error handling has to live inside the per-cell function:
xReg <- function(x, ...) {
r <- try(funTrAR1(x, ...), silent=TRUE)
# if (class(r) == 'try-error') {
if (!is.numeric(r)) { # perhaps a faster test than the one above
r <- c(funTrOLS(x, ...), 2)
} else {
r <- c(r, 1)
}
r
}
I added a layer (the trailing 1 or 2) that shows which model was used for each cell.
You can also do
xReg <- function(x, ...) {
r <- funTrOLS(x, ...)
try( r <- funTrAR1(x, ...), silent=TRUE)
r
}
Or use calc twice and use cover after that
xReg1 <- function(x, ...) {
r <- rep(NA, 9)  # must match the output length of funTrAR1 (3 stats + 6 residual summaries)
try( r <- funTrAR1(x, ...), silent=TRUE)
r
}
xReg2 <- function(x, ...) {
funTrOLS(x, ...)
}
a <- calc(rasterStack, xReg1)
b <- calc(rasterStack, xReg2)
d <- cover(a, b)
And a will show you where xReg1 failed.
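To see where that happened, one sketch (assuming the multi-layer object a returned by calc() above):
# cells where funTrAR1 returned NA (input NAs or fit failures) are NA in 'a'
failed <- is.na(a[[1]])
plot(failed, main = "Cells where the GLS(AR1) fit failed")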
After doing a bit more reading, and also looking at @RobertH's answer, I wrote a bit of (very) ugly code that checks whether GLS will fail and, if it does, performs OLS instead. I'm positive there is a nicer way to do this, but it works, maintains the raster layer names as they were defined in my functions, and exports any errors to a txt file.
for (i in 1) {
j <- tempCentredRas
cat(paste("Checking to see if gls(AR1) will work for model", m, r,"cell based calculations\n", sep = " "))
### This check is particularly annoying as it has to do this for every grid-cell
### it therefore has to perform GLS/OLS on every grid cell twice
### First to check if it (GLS) will fail, and then again if it does fail (use OLS) or doesn't (use GLS)
possibleLocalError <- tryCatch(
raster::calc(j, fun = funTrAR1),
error = function(err)
err
)
if (inherits(possibleLocalError, "error")) {
cat(paste("GLS regression failed for model", m, r, "using OLS instead for cell based results.","\n", sep = " "))
cellResults <- raster::calc(j, fun = funTrOLS)
} else {
cellResults <- raster::calc(j, fun = funTrAR1)
}
}
Consider a list of models that can be created by:
fits = vector(mode="list",length=10)
for(i in 1:10)
{
fits[[i]] = lm(nox~poly(dis,i),data=Boston)
}
Here the Boston dataset is used, which can be found in the MASS package.
Now, in order to make predictions:
dislim = range(Boston$dis)
dis.grid = seq(from = dislim[1],to = dislim[2],by = 0.1)
This gives a grid of dis values at which nox will be predicted.
Now, in order to make predictions, we can do the following:
predict(fits[[1]],list(dis = dis.grid))
But this results in an error:
Error: variable 'poly(dis, i)' was fitted with type "nmatrix.1" but type "nmatrix.10" was supplied
In addition: Warning message:
In Z/rep(sqrt(norm2[-1L]), each = length(x)) :
longer object length is not a multiple of shorter object length
But, when I do the following:
lm.Boston = lm(nox~poly(dis,3),data=Boston)
lm.Boston.pred = predict(lm.Boston,list(dis = dis.grid))
It works fine. So, why can't I do that in the case of a list?
The error occurs because the formula stored in each fit contains the symbol i, which is looked up again when predict re-evaluates poly(dis, i); after the for loop, i is 10, so a degree-10 basis is built against a model fitted with degree 1 (hence "nmatrix.1" vs "nmatrix.10"). The correct way to specify a dynamic formula is to use paste and as.formula:
library(MASS)
data(Boston)
dislim <- range(Boston$dis)
dis.grid <- seq(from = dislim[1],to = dislim[2],by = 0.1)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
To predict:
lapply(models, function(x){
predict(x, list(dis = dis.grid))
})
EDIT: Another way to build the formula (as per MrFlick's comment) is:
`lm(bquote(nox~poly(dis,.(i))), data=Boston)`
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
Additionally (as per Nathan Werth's comment), if the formulation:
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
is used, then i is treated as a variable in the model, and it is possible to exploit this behavior in the following way:
predict(models2[[1]], list(dis = dis.grid, i = 1))
To check that all three approaches produce the same predictions:
library(purrr)
models <- lapply(1:10, function(i){
form = as.formula(paste0("nox~", "poly(dis," , i, ")"))
lm(form, data=Boston)
})
models1 <- lapply(1:10, function(i){
lm(bquote(nox~poly(dis,.(i))), data=Boston)
})
models2 <- lapply(1:10, function(i){
lm(nox~poly(dis,i),data=Boston)
})
missuse <- lapply(models, function(x){
predict(x,list(dis = dis.grid))
})
MrFlick <- lapply(models1, function(x){
predict(x,list(dis = dis.grid))
})
NathanWerth <- purrr::map2(models2, 1:10, function(x, y){
predict(x,list(dis = dis.grid, i = y ))
})
purrr::pmap(list(missuse, MrFlick, NathanWerth), function(x, y, z) c(identical(x, y), identical(x, z)))
I know the theoretical answer to the question in my title, which is discussed here and in this previous question on Stack Overflow. My problem is that, even allowing for numerical rounding, the probability weights I compute from the coefficients fitted by the R function multinom are quite different from the probabilities obtained directly from the same function (through predict(fit, newdata = dat, "probs")). I computed these weights in both Java and R, and the two implementations give the same results, which nonetheless differ from the values returned by predict.
Do you know how I can find the implementation of predict(..., "probs") for the R function multinom?
I first install nnet and open the help page for the nnet function. I see that the function creates an nnet object.
I try predict.nnet, but nothing comes up. This means either the package is not loaded, the function doesn't exist, or it's hidden. methods("predict") reveals that the method is actually hidden (indicated by the *).
> methods("predict")
[1] predict.ar* predict.Arima* predict.arima0* predict.glm
[5] predict.HoltWinters* predict.lm predict.loess* predict.mlm
[9] predict.multinom* predict.nls* predict.nnet* predict.poly
[13] predict.ppr* predict.prcomp* predict.princomp* predict.smooth.spline*
[17] predict.smooth.spline.fit* predict.StructTS*
Calling this function explicitly reveals its code.
> nnet:::predict.nnet
function (object, newdata, type = c("raw", "class"), ...)
{
if (!inherits(object, "nnet"))
stop("object not of class \"nnet\"")
type <- match.arg(type)
if (missing(newdata))
z <- fitted(object)
else {
if (inherits(object, "nnet.formula")) {
newdata <- as.data.frame(newdata)
rn <- row.names(newdata)
Terms <- delete.response(object$terms)
m <- model.frame(Terms, newdata, na.action = na.omit,
xlev = object$xlevels)
if (!is.null(cl <- attr(Terms, "dataClasses")))
.checkMFClasses(cl, m)
keep <- match(row.names(m), rn)
x <- model.matrix(Terms, m, contrasts = object$contrasts)
xint <- match("(Intercept)", colnames(x), nomatch = 0L)
if (xint > 0L)
x <- x[, -xint, drop = FALSE]
}
else {
if (is.null(dim(newdata)))
dim(newdata) <- c(1L, length(newdata))
x <- as.matrix(newdata)
if (any(is.na(x)))
stop("missing values in 'x'")
keep <- 1L:nrow(x)
rn <- rownames(x)
}
ntr <- nrow(x)
nout <- object$n[3L]
.C(VR_set_net, as.integer(object$n), as.integer(object$nconn),
as.integer(object$conn), rep(0, length(object$wts)),
as.integer(object$nsunits), as.integer(0L), as.integer(object$softmax),
as.integer(object$censored))
z <- matrix(NA, nrow(newdata), nout, dimnames = list(rn,
dimnames(object$fitted.values)[[2L]]))
z[keep, ] <- matrix(.C(VR_nntest, as.integer(ntr), as.double(x),
tclass = double(ntr * nout), as.double(object$wts))$tclass,
ntr, nout)
.C(VR_unset_net)
}
switch(type, raw = z, class = {
if (is.null(object$lev)) stop("inappropriate fit for class")
if (ncol(z) > 1L) object$lev[max.col(z)] else object$lev[1L +
(z > 0.5)]
})
}
<bytecode: 0x0000000009305fd8>
<environment: namespace:nnet>
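The same approach works for the method the question actually asks about; for example:
# the multinom method is hidden in the nnet namespace as well
nnet:::predict.multinom
# getAnywhere() also locates hidden S3 methods without knowing the namespace
getAnywhere("predict.multinom")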