I am using the ensembleBMA package in R to fit a gamma0 model for precipitation forecasts.
But it takes a long time when I run it for a whole season over many grid points.
I am using the ensembleBMAgamma0() function to fit the model.
I hope it will run faster if I can parallelize the function. I tried parallelizing the source code, but it did not work properly. I would appreciate help from anyone who has an idea about how to do this.
There is a for loop inside this function (ensembleBMAgamma0()) that fits the model for multiple days; since each day's forecast is independent, the loop should be parallelizable.
I am sharing the method I tried here.
Sample method:
library(future.apply)
plan(multiprocess) ## => parallelize on your local computer
X <- 1:5
y <- future_lapply(X, function(x) {
tmp <- sqrt(x)
tmp
})
Source code used:
ensembleBMAgamma0 <-
function(ensembleData, trainingDays, dates = NULL,
control = controlBMAgamma0(), exchangeable = NULL)
{
#
# copyright 2006-present, University of Washington. All rights reserved.
# for terms of use, see the LICENSE file
#
if (!inherits(ensembleData,"ensembleData")) stop("not an ensembleData object")
if (missing(trainingDays)) stop("trainingDays must be specified")
call <- match.call()
warmStart <- FALSE
if (missing(trainingDays)) stop("trainingDays must be specified")
ensMemNames <- ensembleMembers(ensembleData)
nForecasts <- length(ensMemNames)
exchangeable <- getExchangeable( exchangeable, ensembleGroups(ensembleData),
nForecasts)
# remove instances missing all forecasts, obs or dates
M <- !dataNA(ensembleData)
if (!all(M)) ensembleData <- ensembleData[M,]
nObs <- nrow(ensembleData)
if (!nObs) stop("no data")
Dates <- as.character(ensembleValidDates(ensembleData))
DATES <- sort(unique(Dates))
julianDATES <- ymdhTOjul(DATES)
incr <- min(1,min(diff(julianDATES))) ## incr may be fractional for hours
forecastHour <- ensembleFhour(ensembleData)
lag <- ceiling( forecastHour / 24 )
## dates that can be modeled by the training data (ignoring gaps)
dates <- getDates( DATES, julianDATES, dates, trainingDays, lag, incr)
juliandates <- ymdhTOjul(dates)
nDates <- length(dates)
if (is.null(control$prior)) {
# accomodates saved mean as an additional parameter
prob0coefs <- array( NA, c(3, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
else {
prob0coefs <- array( NA, c(4, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
biasCoefs <- array( NA, c(2, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
varCoefs <- array( NA, c(2, nDates), dimnames = list(NULL, dates))
weights <- array( NA, c(nForecasts, nDates),
dimnames = list(ensMemNames, dates))
trainTable <- rep(0, nDates)
names(trainTable) <- dates
nIter <- loglikelihood <- rep(0, nDates)
names(nIter) <- names(loglikelihood) <- dates
obs <- dataVerifObs(ensembleData)
K <- 1:nForecasts
L <- length(juliandates)
twin <- 1:trainingDays
cat("\n")
l <- 0
for(i in seq(along = juliandates)) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
}
Edited source code for the loop:
y=future_lapply(juliandates, function (i) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
})
I am getting an error after running this.
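For reference, here is a minimal sketch of how I think the loop body can be restructured for future_lapply() (untested; all variable names come from the function body above). Two things have to change: next is not legal inside a function, so skipped dates return NULL instead, and workers cannot write into the shared arrays, so each worker returns its results and the arrays are filled afterwards. For simplicity this drops the j != l caching, the warmStart option, and the training-window extension:

library(future.apply)
plan(multisession)  # replaces the deprecated plan(multiprocess)

fitOneDate <- function(i) {
  I <- (juliandates[i] - lag*incr) >= julianDATES
  if (!any(I)) stop("insufficient training data")
  j <- which(I)[sum(I)]
  twin <- (j + 1) - (1:trainingDays)
  D <- as.logical(match(Dates, DATES[twin], nomatch = 0))
  if (is.null(control$prior) && sum(obs[D] != 0) < control$rainobs)
    return(NULL)  # was `next`, which is not allowed inside a function
  kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
               function(x) all(is.na(x)))
  if (any(kNA)) {
    x <- exchangeable
    if (!is.null(x)) x <- x[-K[kNA]]
    fit <- fitBMAgamma0(ensembleData[D, -K[kNA]], control = control,
                        exchangeable = x)
  } else {
    fit <- fitBMAgamma0(ensembleData[D,], control = control,
                        exchangeable = exchangeable)
  }
  list(i = i, kNA = kNA, fit = fit, nTrain = length(unique(Dates[D])))
}

results <- future_lapply(seq_along(juliandates), fitOneDate)

# fill the output arrays back on the master process
for (res in results) {
  if (is.null(res)) next
  i <- res$i; fit <- res$fit; kNA <- res$kNA
  trainTable[i] <- res$nTrain
  nIter[i] <- fit$nIter
  loglikelihood[i] <- fit$loglikelihood
  prob0coefs[, K[!kNA], i] <- fit$prob0coefs
  biasCoefs[, K[!kNA], i] <- fit$biasCoefs
  varCoefs[, i] <- fit$varCoefs
  weights[K[!kNA], i] <- fit$weights
}

# the final structure() call uses fit$power, so keep one fitted object around
ok <- !vapply(results, is.null, logical(1))
if (any(ok)) fit <- results[[max(which(ok))]]$fit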
Related
I am fitting a GAMLSS model with stepwise selection; this procedure iterates automatically until it finds the best combination of explanatory variables. After I put some explanatory variables in the model, the iterations went well for the first several rounds, and then I got an error like this.
Model with term Spr_Tmean has failed
Model with term Spr_Psum has failed
Model with term Spr_sdmean has failed
Model with term Spr_Wsum has failed
Model with term Sum_Tmean has failed
Model with term Sum_Psum has failed
Model with term Sum_sdmean has failed
Model with term Sum_Wsum has failed
Error in ans[, 1] : incorrect number of dimensions
I have checked other questions about "Error in xxx[, 1]: incorrect number of dimensions", but I don't think they cover my case.
I also list the source function here; you can search for "ans[, 1]" to locate the problem. What does "ans[, 1]" mean here? I am not experienced enough to debug this function, so any explanation of what causes this error, and how to solve it, would be welcome. Thank you in advance.
> stepGAICAll.B
function (object, scope, direction = c("both", "backward",
"forward"), trace = T, keep = NULL, steps = 1000, scale = 0,
k = 2, parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...)
{
mydeviance <- function(x, ...) {
dev <- deviance(x)
if (!is.null(dev))
dev
else extractAIC(x, k = 0)[2]
}
cut.string <- function(string) {
if (length(string) > 1)
string[-1] <- paste("\n", string[-1], sep = "")
string
}
re.arrange <- function(keep) {
namr <- names(k1 <- keep[[1]])
namc <- names(keep)
nc <- length(keep)
nr <- length(k1)
array(unlist(keep, recursive = FALSE), c(nr, nc), list(namr,
namc))
}
step.results <- function(models, fit, object, usingCp = FALSE) {
change <- sapply(models, "[[", "change")
rd <- sapply(models, "[[", "deviance")
dd <- c(NA, abs(diff(rd)))
rdf <- sapply(models, "[[", "df.resid")
ddf <- c(NA, abs(diff(rdf)))
AIC <- sapply(models, "[[", "AIC")
heading <- c("Stepwise Model Path \nAnalysis of Deviance Table",
"\nInitial Model:", deparse(as.vector(formula(object))),
"\nFinal Model:", deparse(as.vector(formula(fit))),
"\n")
aod <- if (usingCp)
data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
Cp = AIC, check.names = FALSE)
else data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
AIC = AIC, check.names = FALSE)
attr(aod, "heading") <- heading
class(aod) <- c("Anova", "data.frame")
fit$anova <- aod
fit
}
droptermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
drop1.scope <- function(terms1, terms2) {
terms1 <- terms(terms1, "mu")
f2 <- if (missing(terms2))
numeric(0)
else attr(terms(terms2, "mu"), "factor")
factor.scope(attr(terms1, "factor"), list(drop = f2))$drop
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
tl <- attr(terms(object, "mu"), "term.labels")
if (missing(scope)) {
scope <- drop1.scope(object)
}
else {
if (!is.character(scope))
scope <- attr(terms(update.formula(formula(object,
"mu"), scope), "mu"), "term.labels")
if (!all(match(scope, tl, FALSE)))
stop("scope is not a subset of term labels")
}
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . -",
term)), what = "All", evaluate = FALSE,
trace = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[1, 1] - ans[, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev - dev[1]
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term deletions", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
addtermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
add.scope <- function(terms1, terms2) {
terms1 <- terms(terms1)
terms2 <- terms(terms2)
factor.scope(attr(terms1, "factor"), list(add = attr(terms2,
"factor")))$add
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
if (missing(scope) || is.null(scope))
stop("no terms in scope")
if (!is.character(scope))
scope <- add.scope(object, terms(update.formula(formula(object,
"mu"), scope)))
if (!length(scope))
stop("no terms in scope for adding to object")
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . +",
term)), what = "All", trace = FALSE, evaluate = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[, 1] - ans[1, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev[1] - dev
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term additions for", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
if (missing(parallel))
parallel <- "no"
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
if (parallel == "multicore")
have_mc <- .Platform$OS.type != "windows"
else if (parallel == "snow")
have_snow <- TRUE
if (!have_mc && !have_snow)
ncpus <- 1L
loadNamespace("parallel")
}
if (have_snow) {
cl <- parallel::makeForkCluster(ncpus)
if (RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
on.exit(parallel::stopCluster(cl))
}
Terms <- terms(object)
object$formula <- Terms
object$call$formula <- Terms
md <- missing(direction)
direction <- match.arg(direction)
backward <- direction == "both" | direction == "backward"
forward <- direction == "both" | direction == "forward"
if (missing(scope)) {
fdrop <- numeric(0)
fadd <- attr(Terms, "factors")
if (md)
forward <- FALSE
}
else {
if (is.list(scope)) {
fdrop <- if (!is.null(fdrop <- scope$lower))
attr(terms(update.formula(formula(object, what = "mu"),
fdrop), what = "mu"), "factors")
else numeric(0)
fadd <- if (!is.null(fadd <- scope$upper))
attr(terms(update.formula(formula(object, what = "mu"),
fadd), what = "mu"), "factors")
}
else {
fadd <- if (!is.null(fadd <- scope))
attr(terms(update.formula(formula(object, what = "mu"),
scope), what = "mu"), "factors")
fdrop <- numeric(0)
}
}
models <- vector("list", steps)
if (!is.null(keep))
keep.list <- vector("list", steps)
if (is.list(object) && (nmm <- match("nobs", names(object),
0)) > 0)
n <- object[[nmm]]
else n <- length(residuals(object))
fit <- object
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (is.na(bAIC))
stop("AIC is not defined for this model, so stepAIC cannot proceed")
nm <- 1
Terms <- terms(fit, "mu")
if (trace)
cat("Start: AIC=", format(round(bAIC, 2)), "\n",
cut.string(deparse(as.vector(formula(fit, what = "mu")))),
"\n\n")
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = "", AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
usingCp <- FALSE
while (steps > 0) {
steps <- steps - 1
AIC <- bAIC
ffac <- attr(Terms, "factors")
if (!is.null(sp <- attr(Terms, "specials")) &&
!is.null(st <- sp$strata))
ffac <- ffac[-st, ]
scope <- factor.scope(ffac, list(add = fadd, drop = fdrop))
aod <- NULL
change <- NULL
if (backward && length(scope$drop)) {
aod <- droptermAllP(fit, scope$drop, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aod)
row.names(aod) <- c(rn[1], paste("-", rn[-1],
sep = " "))
if (any(aod$Df == 0, na.rm = TRUE)) {
zdf <- aod$Df == 0 & !is.na(aod$Df)
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
ch <- abs(aod[zdf, nc] - aod[1, nc]) > 0.01
if (any(ch)) {
warning("0 df terms are changing AIC")
zdf <- zdf[!ch]
}
if (length(zdf) > 0)
change <- rev(rownames(aod)[zdf])[1]
}
}
if (is.null(change)) {
if (forward && length(scope$add)) {
aodf <- addtermAllP(fit, scope$add, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aodf)
row.names(aodf) <- c(rn[1], paste("+",
rn[-1], sep = " "))
aod <- if (is.null(aod))
aodf
else rbind(aod, aodf[-1, , drop = FALSE])
}
attr(aod, "heading") <- NULL
if (is.null(aod) || ncol(aod) == 0)
break
nzdf <- if (!is.null(aod$Df))
aod$Df != 0 | is.na(aod$Df)
aod <- aod[nzdf, ]
if (is.null(aod) || ncol(aod) == 0)
break
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
o <- order(aod[, nc])
if (trace)
print(aod[o, ])
if (o[1] == 1)
break
change <- rownames(aod)[o[1]]
}
usingCp <- match("Cp", names(aod), 0) > 0
fit <- update(fit, paste("~ .", change), evaluate = FALSE,
what = "All", trace = FALSE)
fit <- eval.parent(fit)
if (is.list(fit) && (nmm <- match("nobs", names(fit),
0)) > 0)
nnew <- fit[[nmm]]
else nnew <- length(residuals(fit))
if (nnew != n)
stop("number of rows in use has changed: remove missing values?")
Terms <- terms(fit, "mu")
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (trace)
cat("\nStep: AIC=", format(round(bAIC, 2)),
"\n", cut.string(deparse(as.vector(formula(fit,
"mu")))), "\n\n")
if (bAIC >= AIC + 1e-07)
break
nm <- nm + 1
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = change, AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
}
if (!is.null(keep))
fit$keep <- re.arrange(keep.list[seq(nm)])
step.results(models = models[seq(nm)], fit, object, usingCp)
}
<bytecode: 0x0000026ddc5c40e8>
<environment: namespace:gamlss>
Not sure about your problem, but I prefer using
stepGAICAll.A()
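A rough sketch of how that call can look (my own illustration, not from the question: the response y, the data frame name, and the family are placeholders, and the term names are copied from the failure messages above):

library(gamlss)
m0 <- gamlss(y ~ 1, data = mydata, family = GA)   # hypothetical starting model
mA <- stepGAICAll.A(m0, scope = list(lower = ~ 1,
                    upper = ~ Spr_Tmean + Spr_Psum + Sum_Tmean + Sum_Psum))
summary(mA)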
I am trying to include the minCases argument in the tuning process of a C5.0 model.
Since I am using the caret package, I am trying to get that argument into the tuneGrid.
For that purpose I found the following tutorial:
https://www.euclidean.com/machine-learning-in-practice/2015/6/12/r-caret-and-parameter-tuning-c50
After incorporating the code into my script, I get the following error:
Error: The tuning parameter grid should have columns NA, NA, NA, splits
Does anyone know where the mistake is?
The error occurs as soon as I build my model "mdl" in the last line of the code.
Following the tutorial mentioned above, my current code is the following:
library(datasets)
data(iris)
library('gmodels')
library("RcppCNPy")
library("class")
library("C50")
library('caret')
library('mlbench')
####Customizing the C5.0
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, c("model", "winnow","splits"), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow == loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials != loop$trials[i]])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
# add the splits parameter to the fit function
# minCases is a function of splits
theDots <- list(...)
splits <- param$splits
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
# get the default C5.0 model functions
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
# modify the parameters data frame so that it includes splits
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(levels(c5ModelInfo$parameters$parameter),'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(levels(c5ModelInfo$parameters$label),'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))
# replace the default c5.0 functions with ones that are aware of the splits parameter
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$grid <- C5CustomGrid
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
#Building the actual model
x_a <- iris[c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")]
y_a <-as.factor(iris[,c("Species")])
fitControl <- trainControl(method = "cv", number = 10)
grida <- expand.grid( .winnow = "FALSE", .trials=c(1,5,10,15,20), .model="tree", .splits=c(2,5,10,15,20,25,50,100) )
mdl<- train(x=x_a,y=y_a,tuneGrid=grida,trControl=fitControl,method=c5info)
The problem seems to be in some of the custom functions; I have this other version that works for me:
library(caret)
library(C50)
library(mlbench)
library(tidyverse)
library(plyr)
C5CustomSort <- function(x) {
x$model <- factor(as.character(x$model), levels = c("rules","tree"))
x[order(x$trials, x$model, x$splits, !x$winnow),]
}
C5CustomLoop <- function (grid)
{
loop <- ddply(grid, .(winnow,model, splits,trials), function(x) c(trials = max(x$trials)))
submodels <- vector(mode = "list", length = nrow(loop))
for (i in seq(along = loop$trials)) {
index <- which(grid$model == loop$model[i] & grid$winnow ==
loop$winnow[i] & grid$splits == loop$splits[i])
trials <- grid[index, "trials"]
submodels[[i]] <- data.frame(trials = trials[trials !=
loop$trials[i]],winnow = loop$winnow[i], model=loop$model[i],splits=loop$splits[i])
}
list(loop = loop, submodels = submodels)
}
C5CustomGrid <- function(x, y, len = NULL) {
c5seq <- if(len == 1) 1 else c(1, 10*((2:min(len, 11)) - 1))
expand.grid(trials = c5seq, splits = c(2,10,20,50), winnow = c(TRUE, FALSE), model = c("tree","rules"))
}
C5CustomFit <- function(x, y, wts, param, lev, last, classProbs, ...) {
theDots <- list(...)
splits <- param$splits  # param, not loop: `loop` is not in scope inside the fit function
minCases <- floor( length(y)/splits ) - 1
if(any(names(theDots) == "control"))
{
theDots$control$winnow <- param$winnow
theDots$control$minCases <- minCases
theDots$control$earlyStopping <- FALSE
}
else
theDots$control <- C5.0Control(winnow = param$winnow, minCases = minCases, earlyStopping=FALSE )
argList <- list(x = x, y = y, weights = wts, trials = param$trials, rules = param$model == "rules")
argList <- c(argList, theDots)
do.call("C5.0.default", argList)
}
GetC5Info <- function() {
c5ModelInfo <- getModelInfo(model = "C5.0", regex = FALSE)[[1]]
c5ModelInfo$parameters$parameter <- factor(c5ModelInfo$parameters$parameter,levels=c(c5ModelInfo$parameters$parameter,'splits'))
c5ModelInfo$parameters$label <- factor(c5ModelInfo$parameters$label,levels=c(c5ModelInfo$parameters$label,'Splits'))
c5ModelInfo$parameters <- rbind(c5ModelInfo$parameters,c('splits','numeric','Splits'))
c5ModelInfo$fit <- C5CustomFit
c5ModelInfo$loop <- C5CustomLoop
c5ModelInfo$sort <- C5CustomSort
return (c5ModelInfo)
}
c5info <- GetC5Info()
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
splits<-c(5,25,100)
grid <- expand.grid( winnow = c(FALSE), trials=c(5,6), model=c("tree"), splits=c(5,25,100) )
data(PimaIndiansDiabetes2)
x <- PimaIndiansDiabetes2[c("age","glucose","insulin","mass","pedigree","pregnant","pressure","triceps")]
y <- PimaIndiansDiabetes2$diabetes
mdl<- train(x=x,y=y,tuneGrid=grid,trControl=fitControl,method=c5info,verbose=FALSE)
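If it trains cleanly, the chosen combination can be checked with the standard caret accessors (nothing specific to this customization):

mdl$bestTune   # the winning splits/trials/winnow/model combination
plot(mdl)      # resampling performance across the tuning grid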
I am trying to run factor analysis on a dataset with around 150 variables but only around 80 observations.
I tried the factanal() function in R, and R reported this error:
Error in solve.default(cv) :
system is computationally singular: reciprocal condition number = 3.0804e-20
Any suggestions on alternative methods / packages?
A demonstration on a dummy dataset would be:
# This will work (dataset with 80 obs and 15 predictors)
set.seed(1234)
fake_df = as.data.frame(matrix(rnorm(80*15), nrow = 80))
factanal(fake_df, factors = 2, rotation = "varimax")
# This will not (dataset with 80 obs and 150 predictors)
set.seed(1234)
fake_df = as.data.frame(matrix(rnorm(80*150), nrow = 80))
factanal(fake_df, factors = 2, rotation = "varimax")
So far I have replaced the solve function in the factanal() source code with the numerical solving function I created below, but it did not resolve the issue:
solve_G = function(M){
  library(matrixcalc)  # provides is.singular.matrix()
  if(!is.singular.matrix(M)){
    return(solve(M))
  } else{
    # invert via the SVD: M = U D V', so M^-1 = V D^-1 U'
    s = svd(M)
    U = s$u
    V = s$v
    D_Inv = diag(1/s$d)  # caution: (near-)zero singular values give Inf/huge entries here
    Num_Inv = V %*% D_Inv %*% t(U)
    cat("Singular Matrix! SVD Used.\n")
    return(Num_Inv)
  }
}
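One caveat worth noting (my own observation, not from the original post): for an exactly or numerically singular matrix, the 1/s$d step produces Inf or enormous values, so the returned "inverse" is unusable downstream; a proper pseudo-inverse would zero out the tiny singular values instead:

M <- matrix(c(1, 2, 2, 4), 2, 2)  # rank 1, det(M) == 0
solve_G(M)                        # takes the SVD branch, but the result contains Inf/NaN
# a pseudo-inverse variant would use: D_Inv = diag(ifelse(s$d > 1e-10, 1/s$d, 0))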
After replacing "solve" with "solve_G", a new error occurred:
Error in factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt) :
could not find function "factanal.fit.mle"
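(Side note: this second error happens because factanal.fit.mle is an internal, unexported function of the stats package, so a copied-out version of factanal can only reach it via the triple-colon operator:)

nfit <- stats:::factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)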
P.S. Here is the new "factanal" function named my_factanal:
The error above occurred when running the line:
nfit <- factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)
To run this, set x to an 80 x 150 numeric data frame, factors = 2, scores = "regression", and rotation = "varimax":
my_factanal = function (x, factors, data = NULL, covmat = NULL, n.obs = NA,
subset, na.action, start = NULL, scores = c("none", "regression",
"Bartlett"), rotation = "varimax", control = NULL, ...)
{
sortLoadings <- function(Lambda) {
cn <- colnames(Lambda)
Phi <- attr(Lambda, "covariance")
ssq <- apply(Lambda, 2L, function(x) -sum(x^2))
Lambda <- Lambda[, order(ssq), drop = FALSE]
colnames(Lambda) <- cn
neg <- colSums(Lambda) < 0
Lambda[, neg] <- -Lambda[, neg]
if (!is.null(Phi)) {
unit <- ifelse(neg, -1, 1)
attr(Lambda, "covariance") <- unit %*% Phi[order(ssq),
order(ssq)] %*% unit
}
Lambda
}
cl <- match.call()
na.act <- NULL
if (is.list(covmat)) {
if (any(is.na(match(c("cov", "n.obs"), names(covmat)))))
stop("'covmat' is not a valid covariance list")
cv <- covmat$cov
n.obs <- covmat$n.obs
have.x <- FALSE
}
else if (is.matrix(covmat)) {
cv <- covmat
have.x <- FALSE
}
else if (is.null(covmat)) {
if (missing(x))
stop("neither 'x' nor 'covmat' supplied")
have.x <- TRUE
if (inherits(x, "formula")) {
mt <- terms(x, data = data)
if (attr(mt, "response") > 0)
stop("response not allowed in formula")
attr(mt, "intercept") <- 0
mf <- match.call(expand.dots = FALSE)
names(mf)[names(mf) == "x"] <- "formula"
mf$factors <- mf$covmat <- mf$scores <- mf$start <- mf$rotation <- mf$control <- mf$... <- NULL
mf[[1L]] <- quote(stats::model.frame)
mf <- eval.parent(mf)
na.act <- attr(mf, "na.action")
if (.check_vars_numeric(mf))
stop("factor analysis applies only to numerical variables")
z <- model.matrix(mt, mf)
}
else {
z <- as.matrix(x)
if (!is.numeric(z))
stop("factor analysis applies only to numerical variables")
if (!missing(subset))
z <- z[subset, , drop = FALSE]
}
covmat <- cov.wt(z)
cv <- covmat$cov
n.obs <- covmat$n.obs
}
else stop("'covmat' is of unknown type")
scores <- match.arg(scores)
if (scores != "none" && !have.x)
stop("requested scores without an 'x' matrix")
p <- ncol(cv)
if (p < 3)
stop("factor analysis requires at least three variables")
dof <- 0.5 * ((p - factors)^2 - p - factors)
if (dof < 0)
stop(sprintf(ngettext(factors, "%d factor is too many for %d variables",
"%d factors are too many for %d variables"), factors,
p), domain = NA)
sds <- sqrt(diag(cv))
cv <- cv/(sds %o% sds)
cn <- list(nstart = 1, trace = FALSE, lower = 0.005)
cn[names(control)] <- control
more <- list(...)[c("nstart", "trace", "lower", "opt", "rotate")]
if (length(more))
cn[names(more)] <- more
if (is.null(start)) {
start <- (1 - 0.5 * factors/p)/diag(solve_G(cv))
if ((ns <- cn$nstart) > 1)
start <- cbind(start, matrix(runif(ns - 1), p, ns -
1, byrow = TRUE))
}
start <- as.matrix(start)
if (nrow(start) != p)
stop(sprintf(ngettext(p, "'start' must have %d row",
"'start' must have %d rows"), p), domain = NA)
nc <- ncol(start)
if (nc < 1)
stop("no starting values supplied")
best <- Inf
for (i in 1L:nc) {
nfit <- factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)
if (cn$trace)
cat("start", i, "value:", format(nfit$criteria[1L]),
"uniqs:", format(as.vector(round(nfit$uniquenesses,
4))), "\\n")
if (nfit$converged && nfit$criteria[1L] < best) {
fit <- nfit
best <- fit$criteria[1L]
}
}
if (best == Inf)
stop(ngettext(nc, "unable to optimize from this starting value",
"unable to optimize from these starting values"),
domain = NA)
load <- fit$loadings
if (rotation != "none") {
rot <- do.call(rotation, c(list(load), cn$rotate))
load <- if (is.list(rot)) {
load <- rot$loadings
fit$rotmat <- if (inherits(rot, "GPArotation"))
t(solve_G(rot$Th))
else rot$rotmat
rot$loadings
}
else rot
}
fit$loadings <- sortLoadings(load)
class(fit$loadings) <- "loadings"
fit$na.action <- na.act
if (have.x && scores != "none") {
Lambda <- fit$loadings
zz <- scale(z, TRUE, TRUE)
switch(scores, regression = {
sc <- zz %*% solve(cv, Lambda)
if (!is.null(Phi <- attr(Lambda, "covariance"))) sc <- sc %*%
Phi
}, Bartlett = {
d <- 1/fit$uniquenesses
tmp <- t(Lambda * d)
sc <- t(solve(tmp %*% Lambda, tmp %*% t(zz)))
})
rownames(sc) <- rownames(z)
colnames(sc) <- colnames(Lambda)
if (!is.null(na.act))
sc <- napredict(na.act, sc)
fit$scores <- sc
}
if (!is.na(n.obs) && dof > 0) {
fit$STATISTIC <- (n.obs - 1 - (2 * p + 5)/6 - (2 * factors)/3) *
fit$criteria["objective"]
fit$PVAL <- pchisq(fit$STATISTIC, dof, lower.tail = FALSE)
}
fit$n.obs <- n.obs
fit$call <- cl
fit
}
I have code that was given to me that fits an ARIMA model putting more weight on recent errors. It gives excellent results, much better than a simple ARIMA, but I do not understand the methodology behind it. If you can work out what is going on, and why and how it works, I would really appreciate it :)
The part I would like explained starts from the #----- Weighting ----- comment.
suppressMessages(library(lmtest))
suppressMessages(library(tseries))
suppressMessages(library(forecast))
suppressMessages(library(TTR))
#-------------------------------------------------------------------------------
Input.data <- matrix(c("8Q1","8Q2","8Q3","8Q4","9Q1","9Q2","9Q3","9Q4","10Q1","10Q2","10Q3","10Q4","11Q1","11Q2","11Q3","11Q4","12Q1","12Q2","12Q3","12Q4","13Q1","13Q2","13Q3","13Q4","14Q1","14Q2","14Q3",5403.675741,6773.504993,7231.117289,7835.55156,5236.709983,5526.619467,6555.781711,11464.72728,7210.068674,7501.610403,8670.903486,10872.93518,8209.022658,8153.393088,10196.44775,13244.50201,8356.732878,10188.44157,10601.32205,12617.82102,11786.52641,10044.98676,11006.0051,15101.9456,10992.27282,11421.18922,10731.31198),ncol=2,byrow=FALSE)
#-------------------------------------------------------------------------------
# Maximum seasonal differences allowed. For typical series, 0 is recommended.
max.sdiff <- 2
#-------------------------------------------------------------------------------
# Force seasonality
arima.force.seasonality <- "y"
#-------------------------------------------------------------------------------
# The frequency of the data. 1/4 for QUARTERLY, 1/12 for MONTHLY
Frequency <- 1/4
#-------------------------------------------------------------------------------
# How many quarters/months to forecast
Forecast.horizon <- 4
#-------------------------------------------------------------------------------
# The first date in the series. Use c(8, 1) to denote 2008 q1
Start.date <- c(8, 1)
#-------------------------------------------------------------------------------
# The dates of the forecasts
Forecast.dates <- c("14Q4", "15Q1", "15Q2", "15Q3")
#-------------------------------------------------------------------------------
# Set if the data should be logged. Takes value "s" (lets script choose logging)
#"level" (forces levels) or "log" (forces logs)
force.log <- "s"
#-------------------------------------------------------------------------------
# Selects the data column from Input.data
Data.col <- as.numeric(Input.data[, length(Input.data[1, ])])
#-------------------------------------------------------------------------------
# Turns the Data.col into a time-series
Data.col.ts <- ts(Data.col, deltat=Frequency, start = Start.date)
#-------------------------------------------------------------------------------
# A character vector of the dates from Input.data
Dates.col <- as.character(Input.data[,1])
#-------------------------------------------------------------------------------
# Starts the testing to see if the data should be logged
transform.method <- round(BoxCox.lambda(Data.col.ts, method = "loglik"), 5)
log.values <- seq(0, 0.24999, by = 0.00001)
sqrt.values <- seq(0.25, 0.74999, by = 0.00001)
which.transform.log <- transform.method %in% log.values
which.transform.sqrt <- transform.method %in% sqrt.values
if (which.transform.log == "TRUE"){
as.log <- "log"
Data.new <- log(Data.col.ts)
} else {
if (which.transform.sqrt == "TRUE"){
as.log <- "sqrt"
Data.new <- sqrt(Data.col.ts)
} else {
as.log <- "no"
Data.new <- Data.col.ts
}
}
#----- Weighting ---------------------------------------------------------------
fweight <- function(x){
PatX <- 0.5+x
return(PatX)
}
integ1 <- integrate(fweight, lower = 0.00, upper = 1)
valinteg <- 2*integ1$value
#Split the integral to several intervals, and pick the weights accordingly
integvals <- rep(0, length.out = length(Data.new))
for (i in 1:length(Data.new)){
integi <- integrate(fweight, lower = (i-1)/length(Data.new), upper= i/length(Data.new))
integvals[i] <- 2*integi$value
}
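# --- Annotation (my addition, not part of the original script) ---------------
# The weight for observation i is 2 * integral of (0.5 + x) over
# [(i-1)/n, i/n], which works out to 1/n + (2*i - 1)/n^2: the weights grow
# linearly, so the most recent residuals count most, and they sum to 2.
# A quick check of that closed form against integvals:
n <- length(Data.new)
w <- 1/n + (2 * seq_len(n) - 1)/n^2
all.equal(w, integvals)  # TRUE: closed form matches the numeric integrals
sum(integvals)           # 2
# ------------------------------------------------------------------------------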
suppressWarnings(kpssW <- kpss.test(Data.new, null="Level"))
suppressWarnings(ppW <- tryCatch({
ppW <- pp.test(Data.new, alternative = "stationary")},
error = function(ppW){
ppW <- list(error = "TRUE", p.value = 0.99)
}))
suppressWarnings(adfW <- adf.test(Data.new, alternative = "stationary",
k = trunc((length(Data.new) - 1)^(1/3))))
suppressWarnings(if (kpssW$p.value < 0.05 |
ppW$p.value > 0.05 |
adfW$p.value > 0.05){
ndiffsW = 1
} else {
ndiffsW = 0
})
aaw <- auto.arima(Data.new,
max.D = max.sdiff,
d = ndiffsW,
seasonal = TRUE,
allowdrift = FALSE,
stepwise = FALSE,
trace = FALSE,
seasonal.test = "ch")
order.arima <- c(aaw$arma[1], aaw$arma[6] , aaw$arma[2])
order.seasonal.arima <- c(aaw$arma[3], aaw$arma[7], aaw$arma[4])
if (sum(aaw$arma[1:2]) == 0){
order.arima[1] <- 1
} else {
NULL
}
if (arima.force.seasonality == "y"){
if(sum(aaw$arma[3:4]) == 0){
order.seasonal.arima[1] <- 1
} else {
NULL
}
} else {
NULL
}
#----- ARIMA -------------------------------------------------------------------
# Fits an ARIMA model with the orders set
stAW <- Arima(Data.new,
order = order.arima,
seasonal = list(order = order.seasonal.arima),
method ="ML")
parSW <- stAW$coef
WMAEOPT <- function(parSW){
ArimaW <- Arima(Data.new,
order = order.arima,
seasonal = list(order = order.seasonal.arima),
include.drift = FALSE,
method = "ML",
fixed = c(parSW))
errAR <- c(abs(resid(ArimaW)))
WMAE <- t(errAR) %*% integvals
return(WMAE)
}
OPTWMAE <- optim(parSW,
WMAEOPT,
method = "SANN",
set.seed(2),
control = list(fnscale = 1, maxit = 5000))
parS3 <- OPTWMAE$par
Arima.Data.new <- Arima(Data.new, order = order.arima, seasonal=list(order=order.seasonal.arima),
include.drift=FALSE, method = "ML", fixed = c(parS3))
Say you have a model object of class 'varest' returned from a VAR() regression operation.
I want to save the model to a file, but not all data which was used to estimate the coefficients.
How can one save just the model specification without the training data?
When I save the model, the file size is over 1 GB, so loading it takes a long time.
Can one save objects without some attributes?
The predict.varest function starts out with this code:
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
You can then investigate how much pruning you might achieve:
data(Canada)
tcan <-
VAR(Canada, p = 2, type = "trend")
names(tcan)
# [1] "varresult" "datamat" "y" "type" "p"
# [6] "K" "obs" "totobs" "restrictions" "call"
object.size(tcan[c("K","p", "obs", "type", "datamat", "y")] )
#15080 bytes
object.size(tcan)
#252032 bytes
So the difference is substantial, but just saving those items is not sufficient because the next line in predict.varest is:
B <- Bcoef(object)
You will need to add that object to the list above and then construct a new predict function that accepts something smaller than the full model object with its large 'varresult' node. It also turned out that there is a downstream call to an internal function whose result needs to be stored. (You will need to decide in advance what forecast horizon you need.)
tsmall <- c( tcan[c("K","p", "obs", "type", "datamat", "y", "call")] )
tsmall[["Bco"]] <- Bcoef(tcan)
tsmall$sig.y <- vars:::.fecov(x = tcan, n.ahead = 10)
And the modified predict function will be:
sm.predict <- function (object, ..., n.ahead = 10, ci = 0.95, dumvar = NULL)
{
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
n.ahead <- as.integer(n.ahead)
Z <- object$datamat[, -c(1:K)]
# This used to be a call to Bcoef(object)
B <- object$Bco
if (type == "const") {
Zdet <- matrix(rep(1, n.ahead), nrow = n.ahead, ncol = 1)
colnames(Zdet) <- "const"
}
else if (type == "trend") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(seq(trdstart, length = n.ahead), nrow = n.ahead,
ncol = 1)
colnames(Zdet) <- "trend"
}
else if (type == "both") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(c(rep(1, n.ahead), seq(trdstart, length = n.ahead)),
nrow = n.ahead, ncol = 2)
colnames(Zdet) <- c("const", "trend")
}
else if (type == "none") {
Zdet <- NULL
}
if (!is.null(eval(object$call$season))) {
season <- eval(object$call$season)
seas.names <- paste("sd", 1:(season - 1), sep = "")
cycle <- tail(data.all[, seas.names], season)
seasonal <- as.matrix(cycle, nrow = season, ncol = season -
1)
if (nrow(seasonal) >= n.ahead) {
seasonal <- as.matrix(cycle[1:n.ahead, ], nrow = n.ahead,
ncol = season - 1)
}
else {
while (nrow(seasonal) < n.ahead) {
seasonal <- rbind(seasonal, cycle)
}
seasonal <- seasonal[1:n.ahead, ]
}
rownames(seasonal) <- seq(nrow(data.all) + 1, length = n.ahead)
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, seasonal))
}
else {
Zdet <- as.matrix(seasonal)
}
}
if (!is.null(eval(object$call$exogen))) {
if (is.null(dumvar)) {
stop("\nNo matrix for dumvar supplied, but object varest contains exogenous variables.\n")
}
if (!all(colnames(dumvar) %in% colnames(data.all))) {
stop("\nColumn names of dumvar do not coincide with exogen.\n")
}
if (!identical(nrow(dumvar), n.ahead)) {
stop("\nRow number of dumvar is unequal to n.ahead.\n")
}
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, dumvar))
}
else {
Zdet <- as.matrix(dumvar)
}
}
Zy <- as.matrix(object$datamat[, 1:(K * (p + 1))])
yse <- matrix(NA, nrow = n.ahead, ncol = K)
# This used to be a call to vars:::.fecov
sig.y <- object$sig.y
for (i in 1:n.ahead) {
yse[i, ] <- sqrt(diag(sig.y[, , i]))
}
yse <- -1 * qnorm((1 - ci)/2) * yse
colnames(yse) <- paste(ci, "of", ynames)
forecast <- matrix(NA, ncol = K, nrow = n.ahead)
lasty <- c(Zy[nrow(Zy), ])
for (i in 1:n.ahead) {
lasty <- lasty[1:(K * p)]
Z <- c(lasty, Zdet[i, ])
forecast[i, ] <- B %*% Z
temp <- forecast[i, ]
lasty <- c(temp, lasty)
}
colnames(forecast) <- paste(ynames, ".fcst", sep = "")
lower <- forecast - yse
colnames(lower) <- paste(ynames, ".lower", sep = "")
upper <- forecast + yse
colnames(upper) <- paste(ynames, ".upper", sep = "")
forecasts <- list()
for (i in 1:K) {
forecasts[[i]] <- cbind(forecast[, i], lower[, i], upper[,
i], yse[, i])
colnames(forecasts[[i]]) <- c("fcst", "lower", "upper",
"CI")
}
names(forecasts) <- ynames
result <- list(fcst = forecasts, endog = object$y, model = object,
exo.fcst = dumvar)
class(result) <- "varprd"
return(result)
}
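A hypothetical round trip with the pruned object (the file name is mine; note that n.ahead cannot exceed the horizon baked into sig.y above, 10 here):

saveRDS(tsmall, "var_small.rds")   # far smaller than the full varest object
small <- readRDS("var_small.rds")
pr <- sm.predict(small, n.ahead = 10)
head(pr$fcst$e)                    # forecasts for the 'e' series of the Canada data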
Either
set the attributes you do not want to NULL, or
copy the parts you want to a new object, or
call the save() function with proper indexing.