R C5.0: Error while including minCases in tunegrid (caret) - r

i am trying to implement the minCases-argument into my tuning process of a c5.0 model.
As i am using the caret package i am trying to get that argument into the "tuneGrid".
For that purpose i found the following Tutorial.
https://www.euclidean.com/machine-learning-in-practice/2015/6/12/r-caret-and-parameter-tuning-c50
After implementing the code into my syntax i get the following error:
**Error: The tuning parameter grid should have columns NA, NA, NA, splits**
Anyone knows where there is a mistake?
The error occurs as soon as i am building my model "mdl" in the last line of the code.
With regard to the Tutorial mentionend above my current code is the following:
library(datasets)
data(iris)
library('gmodels')
library("RcppCNPy")
library("class")
library("C50")
library('caret')
library('mlbench')
####Customizing the C5.0
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, c("model", "winnow","splits"), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow == loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
# add the splits parameter to the fit function
# minCases is a function of splits
theDots <- list(...)
splits <- param$splits
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
# get the default C5.0 model functions
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
# modify the parameters data frame so that it includes splits
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(levels(c5ModelInfo$parameters$parameter),'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(levels(c5ModelInfo$parameters$label),'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))
# replace the default c5.0 functions with ones that are aware of the splits parameter
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$grid <- C5CustomGrid
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
#Building the actual model
x_a <- iris[c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")]
y_a <-as.factor(iris[,c("Species")])
fitControl <- trainControl(method = "cv", number = 10)
grida <- expand.grid( .winnow = "FALSE", .trials=c(1,5,10,15,20), .model="tree", .splits=c(2,5,10,15,20,25,50,100) )
mdl<- train(x=x_a,y=y_a,tuneGrid=grida,trControl=fitControl,method=c5info)

the problem seems to be in some of the Custom functions, i have this other version that works for me:
library(caret)
library(C50)
library(mlbench)
library(tidyverse)
library(plyr)
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, .(winnow,model, splits,trials), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow ==
loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials !=
loop$trials[i]],winnow = loop$winnow[i], model=loop$model[i],splits=loop$splits[i])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
splits <- loop$splits
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(c5ModelInfo$parameters$parameter,'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(c5ModelInfo$parameters$label,'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))parameter
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
splits<-c(5,25,100)
grid <- expand.grid( winnow = c(FALSE), trials=c(5,6), model=c("tree"), splits=c(5,25,100) )
data(PimaIndiansDiabetes2)
x <- PimaIndiansDiabetes2[c("age","glucose","insulin","mass","pedigree","pregnant","pressure","triceps")]
y <- PimaIndiansDiabetes2$diabetes
mdl<- train(x=x,y=y,tuneGrid=grid,trControl=fitControl,method=c5info,verbose=FALSE)

Related

Error in ans[, 1] : incorrect number of dimensions while running a linear model

I am doing a GAMLSS model, this linear model could do iterations automatically until it could get a best combinations of explanatory variables. After I put some explanatory variables in the model, it was still good in iteration process at first several rounds, then I got a Error like this.
Model with term Spr_Tmean has failed
Model with term Spr_Psum has failed
Model with term Spr_sdmean has failed
Model with term Spr_Wsum has failed
Model with term Sum_Tmean has failed
Model with term Sum_Psum has failed
Model with term Sum_sdmean has failed
Model with term Sum_Wsum has failed
Error in ans[, 1] : incorrect number of dimensions
I also checked some questions related to Error in xxx[,1]: incorrect number of dimensions, but i think this is not what i want.
I also list the source function in here, you could search "ans[, 1]" to locate the problem. What "ans[, 1]" means in here? I am not professional to check this function, so any answer about the reason caused this Error, and how to solve this problem would be welcome. Thank you in advance.
> stepGAICAll.B
function (object, scope, direction = c("both", "backward",
"forward"), trace = T, keep = NULL, steps = 1000, scale = 0,
k = 2, parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...)
{
mydeviance <- function(x, ...) {
dev <- deviance(x)
if (!is.null(dev))
dev
else extractAIC(x, k = 0)[2]
}
cut.string <- function(string) {
if (length(string) > 1)
string[-1] <- paste("\n", string[-1], sep = "")
string
}
re.arrange <- function(keep) {
namr <- names(k1 <- keep[[1]])
namc <- names(keep)
nc <- length(keep)
nr <- length(k1)
array(unlist(keep, recursive = FALSE), c(nr, nc), list(namr,
namc))
}
step.results <- function(models, fit, object, usingCp = FALSE) {
change <- sapply(models, "[[", "change")
rd <- sapply(models, "[[", "deviance")
dd <- c(NA, abs(diff(rd)))
rdf <- sapply(models, "[[", "df.resid")
ddf <- c(NA, abs(diff(rdf)))
AIC <- sapply(models, "[[", "AIC")
heading <- c("Stepwise Model Path \nAnalysis of Deviance Table",
"\nInitial Model:", deparse(as.vector(formula(object))),
"\nFinal Model:", deparse(as.vector(formula(fit))),
"\n")
aod <- if (usingCp)
data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
Cp = AIC, check.names = FALSE)
else data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
AIC = AIC, check.names = FALSE)
attr(aod, "heading") <- heading
class(aod) <- c("Anova", "data.frame")
fit$anova <- aod
fit
}
droptermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
drop1.scope <- function(terms1, terms2) {
terms1 <- terms(terms1, "mu")
f2 <- if (missing(terms2))
numeric(0)
else attr(terms(terms2, "mu"), "factor")
factor.scope(attr(terms1, "factor"), list(drop = f2))$drop
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
tl <- attr(terms(object, "mu"), "term.labels")
if (missing(scope)) {
scope <- drop1.scope(object)
}
else {
if (!is.character(scope))
scope <- attr(terms(update.formula(formula(object,
"mu"), scope), "mu"), "term.labels")
if (!all(match(scope, tl, FALSE)))
stop("scope is not a subset of term labels")
}
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . -",
term)), what = "All", evaluate = FALSE,
trace = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[1, 1] - ans[, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev - dev[1]
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term deletions", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
addtermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
add.scope <- function(terms1, terms2) {
terms1 <- terms(terms1)
terms2 <- terms(terms2)
factor.scope(attr(terms1, "factor"), list(add = attr(terms2,
"factor")))$add
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
if (missing(scope) || is.null(scope))
stop("no terms in scope")
if (!is.character(scope))
scope <- add.scope(object, terms(update.formula(formula(object,
"mu"), scope)))
if (!length(scope))
stop("no terms in scope for adding to object")
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . +",
term)), what = "All", trace = FALSE, evaluate = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[, 1] - ans[1, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev[1] - dev
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term additions for", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
if (missing(parallel))
parallel <- "no"
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
if (parallel == "multicore")
have_mc <- .Platform$OS.type != "windows"
else if (parallel == "snow")
have_snow <- TRUE
if (!have_mc && !have_snow)
ncpus <- 1L
loadNamespace("parallel")
}
if (have_snow) {
cl <- parallel::makeForkCluster(ncpus)
if (RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
on.exit(parallel::stopCluster(cl))
}
Terms <- terms(object)
object$formula <- Terms
object$call$formula <- Terms
md <- missing(direction)
direction <- match.arg(direction)
backward <- direction == "both" | direction == "backward"
forward <- direction == "both" | direction == "forward"
if (missing(scope)) {
fdrop <- numeric(0)
fadd <- attr(Terms, "factors")
if (md)
forward <- FALSE
}
else {
if (is.list(scope)) {
fdrop <- if (!is.null(fdrop <- scope$lower))
attr(terms(update.formula(formula(object, what = "mu"),
fdrop), what = "mu"), "factors")
else numeric(0)
fadd <- if (!is.null(fadd <- scope$upper))
attr(terms(update.formula(formula(object, what = "mu"),
fadd), what = "mu"), "factors")
}
else {
fadd <- if (!is.null(fadd <- scope))
attr(terms(update.formula(formula(object, what = "mu"),
scope), what = "mu"), "factors")
fdrop <- numeric(0)
}
}
models <- vector("list", steps)
if (!is.null(keep))
keep.list <- vector("list", steps)
if (is.list(object) && (nmm <- match("nobs", names(object),
0)) > 0)
n <- object[[nmm]]
else n <- length(residuals(object))
fit <- object
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (is.na(bAIC))
stop("AIC is not defined for this model, so stepAIC cannot proceed")
nm <- 1
Terms <- terms(fit, "mu")
if (trace)
cat("Start: AIC=", format(round(bAIC, 2)), "\n",
cut.string(deparse(as.vector(formula(fit, what = "mu")))),
"\n\n")
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = "", AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
usingCp <- FALSE
while (steps > 0) {
steps <- steps - 1
AIC <- bAIC
ffac <- attr(Terms, "factors")
if (!is.null(sp <- attr(Terms, "specials")) &&
!is.null(st <- sp$strata))
ffac <- ffac[-st, ]
scope <- factor.scope(ffac, list(add = fadd, drop = fdrop))
aod <- NULL
change <- NULL
if (backward && length(scope$drop)) {
aod <- droptermAllP(fit, scope$drop, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aod)
row.names(aod) <- c(rn[1], paste("-", rn[-1],
sep = " "))
if (any(aod$Df == 0, na.rm = TRUE)) {
zdf <- aod$Df == 0 & !is.na(aod$Df)
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
ch <- abs(aod[zdf, nc] - aod[1, nc]) > 0.01
if (any(ch)) {
warning("0 df terms are changing AIC")
zdf <- zdf[!ch]
}
if (length(zdf) > 0)
change <- rev(rownames(aod)[zdf])[1]
}
}
if (is.null(change)) {
if (forward && length(scope$add)) {
aodf <- addtermAllP(fit, scope$add, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aodf)
row.names(aodf) <- c(rn[1], paste("+",
rn[-1], sep = " "))
aod <- if (is.null(aod))
aodf
else rbind(aod, aodf[-1, , drop = FALSE])
}
attr(aod, "heading") <- NULL
if (is.null(aod) || ncol(aod) == 0)
break
nzdf <- if (!is.null(aod$Df))
aod$Df != 0 | is.na(aod$Df)
aod <- aod[nzdf, ]
if (is.null(aod) || ncol(aod) == 0)
break
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
o <- order(aod[, nc])
if (trace)
print(aod[o, ])
if (o[1] == 1)
break
change <- rownames(aod)[o[1]]
}
usingCp <- match("Cp", names(aod), 0) > 0
fit <- update(fit, paste("~ .", change), evaluate = FALSE,
what = "All", trace = FALSE)
fit <- eval.parent(fit)
if (is.list(fit) && (nmm <- match("nobs", names(fit),
0)) > 0)
nnew <- fit[[nmm]]
else nnew <- length(residuals(fit))
if (nnew != n)
stop("number of rows in use has changed: remove missing values?")
Terms <- terms(fit, "mu")
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (trace)
cat("\nStep: AIC=", format(round(bAIC, 2)),
"\n", cut.string(deparse(as.vector(formula(fit,
"mu")))), "\n\n")
if (bAIC >= AIC + 1e-07)
break
nm <- nm + 1
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = change, AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
}
if (!is.null(keep))
fit$keep <- re.arrange(keep.list[seq(nm)])
step.results(models = models[seq(nm)], fit, object, usingCp)
}
<bytecode: 0x0000026ddc5c40e8>
<environment: namespace:gamlss>
Not sure about your problem, but I prefer using
stepGAICAll.A()

Intialized vector in AUC function out of bounds

I am trying to use cross validation with a decision tree using AUC. These are the functions that I am using:
.cvFolds <- function(Y, V) {
Y0 <- split(sample(which(Y == 0)), rep(1:V, length = length(which(Y == 0))))
Y1 <- split(sample(which(Y == 1)), rep(1:V, length = length(which(Y == 1))))
folds <- vector("list", length = V)
for (v in seq(V)) folds[[v]] <- c(Y0[[v]], Y1[[v]])
return(folds)
}
.doFit <- function(V, folds, train) {
set.seed(v)
ycol <- which(names(train) == y)
params <- list(x = train[-folds[[V]], -ycol],
y = as.factor(train[-folds[[V]], ycol]),
xtest = train[folds[[V]], -ycol])
fit <- do.call(randomForest, params)
pred <- fit$test$votes[, 2]
return(pred)
}
This is the function to calculate probabilities:
iid_example <- function(train, y = "V1", V = 10, seed = 1) {
set.seed(seed)
folds <- .cvFolds(Y = train[, c(y)], V = V)
# Generate CV predicted values
cl <- makeCluster(detectCores())
registerDoParallel(cl)
predictions <- foreach(v = 1:V, .combine = "c",
.packages = c("randomForest")) %dopar% .doFit(v, folds, train)
stopCluster(cl)
predictions[unlist(folds)] <- predictions
# Get CV AUC
runtime <- system.time(res <- ci.cvAUC(predictions = predictions,
labels = train[, c(y)],
folds = folds,
confidence = 0.95))
print(runtime)
return(res)
}
The actual function call:
res <- iid_example(train = datos, y = "V1", V = 10, seed = 1)
When I try to run it, I get the following error:
Y0[[v]] out of bounds
I am trying to adjust the parameterization of the function, but I do not understand why it is out of boundaries. Thanks for your help

R: Caret rfe with a customized variable importance permimp

I want to run a recursive feature elimination with caret rfe() with the alternative variable importance algorithm permimp. The permimp permutation importance uses cforest with cforest_unbiased(). Which other caret functions do I need to customize in order run rfe with permimp() and cforest?
This is my code so far:
library(caret)
permimpRFE <- list(summary = defaultSummary,
fit = function(x, y, first, last, ...){
library(party)
tmp <- as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
party::cforest(y ~ ., data = tmp,
control = party::cforest_unbiased(ntree = 50))
},
pred = function(object, x) predict(object, x),
rank = function(object, x, y) {
library(permimp)
vimp <- permimp::permimp(object, conditional = TRUE, threshold = .95, do_check = FALSE)
vimp <- as.data.frame(vimp$values)
colnames(vimp) <- "Overall"
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
if (ncol(x) == 1) {
vimp$var <- colnames(x)
} else vimp$var <- rownames(vimp)
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars)
# specify rfeControl
contr <- caret::rfeControl(functions=permimpRFE, method="repeatedcv", number=3, repeats=2,
saveDetails = TRUE)
dat <- as.data.frame(ChickWeight)[1:50,]
preds <- dat[,2:4]
response <- dat[,1]
# recursive feature elimination caret (Algorithm 2)
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
results <- caret::rfe(x = preds,
y = response,
sizes=c(1:3),
metric= "RMSE",
rfeControl=contr)
I get the error Error in { : task 1 failed - "invalid 'x' type in 'x && y'"
How can I get the rfe running with permimp and cforest?
When trying to customize the variable importance function for rfe(), it is important to implement the exact syntax of the desired function as well as its dependent functions.
In my case, I had to change predict(object, x) to predict(object, newdata = x).
Now the code snippet is working:
library(caret)
permimpRFE <- list(summary = defaultSummary,
fit = function(x, y, first, last, ...){
library(party)
tmp <- as.data.frame(x, stringsAsFactors = TRUE)
tmp$y <- y
party::cforest(y ~ ., data = tmp,
control = party::cforest_unbiased(ntree = 50))
},
pred = function(object, x) {
x <- as.data.frame(x, stringsAsFactors = TRUE)
predict(object, newdata = x)
},
rank = function(object, x, y) {
library(permimp)
vimp <- permimp::permimp(object, conditional = TRUE, threshold = .95, do_check = FALSE)
vimp <- as.data.frame(vimp$values)
colnames(vimp) <- "Overall"
vimp <- vimp[order(vimp$Overall, decreasing = TRUE),, drop = FALSE]
if (ncol(x) == 1) {
vimp$var <- colnames(x)
} else vimp$var <- rownames(vimp)
vimp
},
selectSize = pickSizeBest,
selectVar = pickVars)
# specify rfeControl
contr <- caret::rfeControl(functions=permimpRFE, method="repeatedcv", number=3, repeats=2,
saveDetails = TRUE)
dat <- as.data.frame(ChickWeight)[1:50,]
preds <- dat[,2:4]
response <- dat[,1]
# recursive feature elimination caret (Algorithm 2)
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
results <- caret::rfe(x = preds,
y = response,
sizes=c(1:3),
metric= "RMSE",
rfeControl=contr)

Reproducing results from previous answer is not working due to using new version of lme4

I have tried to reproduce the results from the answers for this question “Estimating random effects and applying user defined correlation/covariance structure with R lme4 or nlme package “ https://stats.stackexchange.com/questions/18563/estimating-random-effects-and-applying-user-defined-correlation-covariance-struc
Aaron Rendahl's codes
library(pedigreemm)
relmatmm <- function (formula, data, family = NULL, REML = TRUE, relmat = list(),
control = list(), start = NULL, verbose = FALSE, subset,
weights, na.action, offset, contrasts = NULL, model = TRUE,
x = TRUE, ...)
{
mc <- match.call()
lmerc <- mc
lmerc[[1]] <- as.name("lmer")
lmerc$relmat <- NULL
if (!length(relmat))
return(eval.parent(lmerc))
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
lmerc$doFit <- FALSE
lmf <- eval(lmerc, parent.frame())
relfac <- relmat
relnms <- names(relmat)
stopifnot(all(relnms %in% names(lmf$FL$fl)))
asgn <- attr(lmf$FL$fl, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(lmf$FL$fl)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be associated with only one random effects term")
Zt <- lmf$FL$trms[[tn]]$Zt
relmat[[i]] <- Matrix(relmat[[i]][rownames(Zt), rownames(Zt)],
sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
lmf$FL$trms[[tn]]$Zt <- lmf$FL$trms[[tn]]$A <- relfac[[i]] %*% Zt
}
ans <- do.call(if (!is.null(lmf$glmFit))
lme4:::glmer_finalize
else lme4:::lmer_finalize, lmf)
ans <- new("pedigreemm", relfac = relfac, ans)
ans#call <- match.call()
ans
}
the original example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat), data=mydata)
here is the error message
Error in lmf$FL : $ operator not defined for this S4 class
In addition: Warning message:
In checkArgs("lmer", doFit = FALSE) : extra argument(s) ‘doFit’ disregarded
I will appreciate any help ?
Thanks
This is a re-implementation of the previous code -- I have done some slight modifications, and I have not tested it in any way -- test yourself and/or use at your own risk.
First create a slightly more modularized function that constructs the deviance function and fits the model:
doFit <- function(lmod,lmm=TRUE) {
## see ?modular
if (lmm) {
devfun <- do.call(mkLmerDevfun, lmod)
opt <- optimizeLmer(devfun)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
} else {
devfun <- do.call(mkGlmerDevfun, lmod)
opt <- optimizeGlmer(devfun)
devfun <- updateGlmerDevfun(devfun, lmod$reTrms)
opt <- optimizeGlmer(devfun, stage=2)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
}
}
Now create a function to construct the object that doFit needs and modify it:
relmatmm <- function (formula, ..., lmm=TRUE, relmat = list()) {
ff <- if (lmm) lFormula(formula, ...) else glFormula(formula, ...)
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
relnms <- names(relmat)
relfac <- relmat
flist <- ff$reTrms[["flist"]] ## list of factors
## random-effects design matrix components
Ztlist <- ff$reTrms[["Ztlist"]]
stopifnot(all(relnms %in% names(flist)))
asgn <- attr(flist, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(flist)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be",
" associated with only one random effects term")
zn <- rownames(Ztlist[[i]])
relmat[[i]] <- Matrix(relmat[[i]][zn,zn],sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
Ztlist[[i]] <- relfac[[i]] %*% Ztlist[[i]]
}
ff$reTrms[["Ztlist"]] <- Ztlist
ff$reTrms[["Zt"]] <- do.call(rBind,Ztlist)
fit <- doFit(ff,lmm)
}
Example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat),
data=mydata)
This runs -- I don't know if the output is correct. It also doesn't make the resulting object into a pedigreemm object ...

Saving huge model object to file

Say you have a model object of class 'varrest' returned from a VAR() regression operation.
I want to save the model to a file, but not all data which was used to estimate the coefficients.
How can one just save the model specification wihtout the training data?
Because when I save the model it has a file size of over 1GB and therefore loading does take its time.
Can one save objects without some attributes?
The predict.varest function starts out with this code:
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
You can then investigate how much pruning you might achieve:
data(Canada)
tcan <-
VAR(Canada, p = 2, type = "trend")
names(tcan)
# [1] "varresult" "datamat" "y" "type" "p"
# [6] "K" "obs" "totobs" "restrictions" "call"
object.size(tcan[c("K","p", "obs", "type", "datamat", "y")] )
#15080 bytes
object.size(tcan)
#252032 bytes
So the difference is substantial, but just saving those items is not sufficient because the next line in predict.varest is:
B <- Bcoef(object)
You will need to add that object to the list above and then construct a new predict-function that accepts something less than the large 'varresult' node of the model object. Also turned out that there was a downstream call to an internal function that needs to be stored. (You will need to decide in advance what interval you need for prediction.)
tsmall <- c( tcan[c("K","p", "obs", "type", "datamat", "y", "call")] )
tsmall[["Bco"]] <- Bcoef(tcan)
tsmall$sig.y <- vars:::.fecov(x = tcan, n.ahead = 10)
And the modified predict function will be:
sm.predict <- function (object, ..., n.ahead = 10, ci = 0.95, dumvar = NULL)
{
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
n.ahead <- as.integer(n.ahead)
Z <- object$datamat[, -c(1:K)]
# This used to be a call to Bcoef(object)
B <- object$Bco
if (type == "const") {
Zdet <- matrix(rep(1, n.ahead), nrow = n.ahead, ncol = 1)
colnames(Zdet) <- "const"
}
else if (type == "trend") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(seq(trdstart, length = n.ahead), nrow = n.ahead,
ncol = 1)
colnames(Zdet) <- "trend"
}
else if (type == "both") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(c(rep(1, n.ahead), seq(trdstart, length = n.ahead)),
nrow = n.ahead, ncol = 2)
colnames(Zdet) <- c("const", "trend")
}
else if (type == "none") {
Zdet <- NULL
}
if (!is.null(eval(object$call$season))) {
season <- eval(object$call$season)
seas.names <- paste("sd", 1:(season - 1), sep = "")
cycle <- tail(data.all[, seas.names], season)
seasonal <- as.matrix(cycle, nrow = season, ncol = season -
1)
if (nrow(seasonal) >= n.ahead) {
seasonal <- as.matrix(cycle[1:n.ahead, ], nrow = n.ahead,
ncol = season - 1)
}
else {
while (nrow(seasonal) < n.ahead) {
seasonal <- rbind(seasonal, cycle)
}
seasonal <- seasonal[1:n.ahead, ]
}
rownames(seasonal) <- seq(nrow(data.all) + 1, length = n.ahead)
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, seasonal))
}
else {
Zdet <- as.matrix(seasonal)
}
}
if (!is.null(eval(object$call$exogen))) {
if (is.null(dumvar)) {
stop("\nNo matrix for dumvar supplied, but object varest contains exogenous variables.\n")
}
if (!all(colnames(dumvar) %in% colnames(data.all))) {
stop("\nColumn names of dumvar do not coincide with exogen.\n")
}
if (!identical(nrow(dumvar), n.ahead)) {
stop("\nRow number of dumvar is unequal to n.ahead.\n")
}
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, dumvar))
}
else {
Zdet <- as.matrix(dumvar)
}
}
Zy <- as.matrix(object$datamat[, 1:(K * (p + 1))])
yse <- matrix(NA, nrow = n.ahead, ncol = K)
# This used to be a call to vars:::.fecov
sig.y <- object$sig.y
for (i in 1:n.ahead) {
yse[i, ] <- sqrt(diag(sig.y[, , i]))
}
yse <- -1 * qnorm((1 - ci)/2) * yse
colnames(yse) <- paste(ci, "of", ynames)
forecast <- matrix(NA, ncol = K, nrow = n.ahead)
lasty <- c(Zy[nrow(Zy), ])
for (i in 1:n.ahead) {
lasty <- lasty[1:(K * p)]; print(lasty); print(B)
Z <- c(lasty, Zdet[i, ]) ;print(Z)
forecast[i, ] <- B %*% Z
temp <- forecast[i, ]
lasty <- c(temp, lasty)
}
colnames(forecast) <- paste(ynames, ".fcst", sep = "")
lower <- forecast - yse
colnames(lower) <- paste(ynames, ".lower", sep = "")
upper <- forecast + yse
colnames(upper) <- paste(ynames, ".upper", sep = "")
forecasts <- list()
for (i in 1:K) {
forecasts[[i]] <- cbind(forecast[, i], lower[, i], upper[,
i], yse[, i])
colnames(forecasts[[i]]) <- c("fcst", "lower", "upper",
"CI")
}
names(forecasts) <- ynames
result <- list(fcst = forecasts, endog = object$y, model = object,
exo.fcst = dumvar)
class(result) <- "varprd"
return(result)
}
Either
set the attributes you do not want to NULL, or
copy the parts you want to a new object, or
call the save() function with proper indexing.

Resources