I'm trying to create a loop (performed 10 times) to get repeated cross-validation to evaluate the predictive performance of 4 models, and then I have to compute an average of my performances. I must admit that I am new to R and I am struggling with this simple task. I started by creating my formula to cross-validate my models.
loss.mse <- function(fit, df, y, transf){
y_pred <- transf(predict(fit, df))
out <- (y - y_pred)^2
return(mean(out))
}
loss.mae <- function(fit, df, y, transf){
y_pred <- transf(predict(fit, df))
out <- abs(y - y_pred)
return(mean(out))
}
validate.cv <- function(data, folds, model_fn, y_var,
transf = identity, seed)
{
set.seed(seed)
fold_id <- sample(rep(1:folds, length.out = nrow(data)))
out.mse <- out.mae <- numeric(folds)
for(test in 1:folds){
data_test <- subset(data, fold_id == test)
data_train <- subset(data, fold_id != test)
fit <- model_fn(data_train)
y_test <- y_var[fold_id == test]
out.mse[test] <- loss.mse(fit, data_test, y_test, transf)
out.mae[test] <- loss.mae(fit, data_test, y_test, transf)
}
return(list(MAE = mean(out.mae), MSE = out.mse,
RMSE = sqrt(mean(out.mse))))
Then I named my models and I cross-validated them but I can't figure out how to get my 10 times loop !
model_lm <- function(data) lm(StockPrice ~., data)
model_step <- function(data) step(lm(StockPrice ~., data), trace = 0)
model_rpart <- function(data)
{
set.seed(1234)
mod.rpart <- rpart(StockPrice ~ ., data, cp = 0.0001, model = TRUE)
cp.select <- function(big.tree) {
min.x <- which.min(big.tree$cptable[, 4])
for(i in 1:nrow(big.tree$cptable)) {
if(big.tree$cptable[i, 4] <
(big.tree$cptable[min.x, 4] + big.tree$cptable[min.x, 5]))
return(big.tree$cptable[i, 1])
}
}
mod.rpart.prune <- prune(mod.rpart, cp = cp.select(mod.rpart))
return(mod.rpart.prune)
}
model_step_gam <- function(data)
{
mod <- model_step(data)
predictors <- all.vars(terms(mod))[-1]
f <- as.formula(
paste("StockPrice",
paste(paste("s(",predictors, ")"), collapse = " + "),
sep = " ~ "))
mod_gam <- gam(f, data = data)
seed<-1234
m.log.full <-validate.cv(log.Finance, 10, model_lm, Finance$StockPrice,exp, seed)
m.log.step <-validate.cv(log.Finance, 10, model_step, Finance$StockPrice,exp, seed)
m.log.rpart <-validate.cv(log.Finance, 10, model_rpart, Finance$StockPrice,exp, seed)
m.log.gam <-validate.cv(log.Finance, 10, model_step_gam, Finance$StockPrice,exp, seed)
mat.test <-data.frame(Model =c("Full (log)","Step (log)","CART (log)", "Step GAM (log)"),
RMSE =c(m.log.full$RMSE, m.log.step$RMSE,m.log.rpart$RMSE, m.log.gam$RMSE),
MAE =c(m.log.full$MAE, m.log.step$MAE, m.log.rpart$MAE,m.log.gam$MAE))
print(mat.test)
If you have any ideas I would gladly try them. Thank you in advance for your help :)
Related
I am trying to use cross validation with a decision tree using AUC. These are the functions that I am using:
.cvFolds <- function(Y, V) {
Y0 <- split(sample(which(Y == 0)), rep(1:V, length = length(which(Y == 0))))
Y1 <- split(sample(which(Y == 1)), rep(1:V, length = length(which(Y == 1))))
folds <- vector("list", length = V)
for (v in seq(V)) folds[[v]] <- c(Y0[[v]], Y1[[v]])
return(folds)
}
.doFit <- function(V, folds, train) {
set.seed(v)
ycol <- which(names(train) == y)
params <- list(x = train[-folds[[V]], -ycol],
y = as.factor(train[-folds[[V]], ycol]),
xtest = train[folds[[V]], -ycol])
fit <- do.call(randomForest, params)
pred <- fit$test$votes[, 2]
return(pred)
}
This is the function to calculate probabilities:
iid_example <- function(train, y = "V1", V = 10, seed = 1) {
set.seed(seed)
folds <- .cvFolds(Y = train[, c(y)], V = V)
# Generate CV predicted values
cl <- makeCluster(detectCores())
registerDoParallel(cl)
predictions <- foreach(v = 1:V, .combine = "c",
.packages = c("randomForest")) %dopar% .doFit(v, folds, train)
stopCluster(cl)
predictions[unlist(folds)] <- predictions
# Get CV AUC
runtime <- system.time(res <- ci.cvAUC(predictions = predictions,
labels = train[, c(y)],
folds = folds,
confidence = 0.95))
print(runtime)
return(res)
}
The actual function call:
res <- iid_example(train = datos, y = "V1", V = 10, seed = 1)
When I try to run it, I get the following error:
Y0[[v]] out of bounds
I am trying to adjust the parameterization of the function, but I do not understand why it is out of boundaries. Thanks for your help
I have performed the following cross-validation knn (using the caret package) on the iris dataset. I am now trying to plot the training and test error rates for the result. Here is my attempt but I cannot get the error rates. Can anyone help me please?
library(caret)
data(iris)
sample <- sample(2, nrow(iris), replace=TRUE, prob=c(0.80, 0.20))
iris.training <- iris[sample == 1, 1:4]
iris.test <- iris[sample == 2, 1:4]
iris.trainLabels <- iris[sample == 1, 5]
iris.testLabels <- iris[sample == 2, 5]
# Combine training data and combine test data.
iris_train <- cbind(iris.trainLabels, iris.training)
iris_test <- cbind(iris.testLabels, iris.test)
trControl <- trainControl(method = "cv", number = 5)
# K values 1 3 5 7 9
k_values <- seq(from=1, to=10, by=2)
fit <- train(iris.trainLabels ~ ., method = "knn", tuneGrid = expand.grid(k = k_values), trControl = trControl, data = iris_train)
# Plot
bestK <- function(iris_train, iris.trainLabels,
iris.testLabels) {
ctr <- c(); cts <- c()
for (k in length(k_values)) {
fit <- train(iris.trainLabels ~ ., method = "knn", tuneGrid = expand.grid(k = k_values), trControl = trControl, data = iris_train)
trTable <- prop.table(table(fit, iris.trainLabels))
tsTable <- prop.table(table(fit, iris.testLabels))
erTr <- trTable[1,2] + trTable[2,1]
erTs <- tsTable[1,2] + tsTable[2,1]
ctr <- c(ctr,erTr)
cts <- c(cts,erTs)
}
err <- data.frame(k=k_values, trER=ctr, tsER=cts)
return(err)
}
err <- bestK(iris_train, iris.trainLabels, iris.testLabels)
plot(err$k,err$trER,type='o',ylim=c(0,.5),xlab="k",ylab="Error rate",col="blue")
lines(err$k,err$tsER,type='o',col="red")
Update:
Would like to obtain a visual plot something similar to this...
I have set up the following function:
cv_model <- function(dat, targets, predictors_name){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
MAE <- mean(abs(data$obs - data$pred))
BIAS <- mean(data$obs - data$pred)
Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
}
for (k in 1:length(dat)) {
a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
ab <- list(a, b)
for (i in 1:length(targets)) {
for (j in 1:length(ab)) {
# specifiy trainControl
control <- trainControl(method="repeatedcv", number=10, repeats=10, search="grid", savePred =T,
summaryFunction = sumfct)
tunegrid <- expand.grid(mtry=c(1:length(predictors_name)))
set.seed(42)
model <- train(formula(paste0(targets[i],
" ~ ",
paste(predictors_name, sep = '', collapse = ' + '))),
data = ab[[j]],
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=tunegrid,
trControl=control)
}
}
}
}
According to this tutorial (https://topepo.github.io/caret/parallel-processing.html) I can parallelize my code just by calling library(doParallel); cl <- makePSOCKcluster(2); registerDoParallel(cl).
When I then use the function with doParallel
predictors_name <- c("Time", "Chick")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:500,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)
the error message couldn't find function "MAPE" occurs.
How can I fix this without using the foreach syntax?
If I specify the package while calling the function like package::function, then it is working. Maybe there is a more elegant solution, but this is how I made the code running without an error:
cv_model <- function(dat, targets, predictors_name){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
MAE <- mean(abs(data$obs - data$pred))
BIAS <- mean(data$obs - data$pred)
Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
}
for (k in 1:length(dat)) {
a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
ab <- list(a, b)
for (i in 1:length(targets)) {
for (j in 1:length(ab)) {
# specifiy trainControl
control <- caret::trainControl(method="repeatedcv", number=10, repeats=10, search="grid", savePred =T,
summaryFunction = sumfct)
tunegrid <- expand.grid(mtry=c(1:length(predictors_name)))
set.seed(42)
model <- caret::train(formula(paste0(targets[i],
" ~ ",
paste(predictors_name, sep = '',
collapse = ' + '))),
data = ab[[j]],
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=tunegrid,
trControl=control)
}
}
}
}
predictors_name <- c("Time", "Chick", "Diet")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:578,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)
Throughout my function I have arguments z and y
I want z to be equal to a data set (for example birthwt) and y to be equal to a response variable (for example birthwt$low)
library("MASS")
library("dplyr")
data(birthwt)
foo=function(z,y){
n.folds <- 10
folds <- cut(sample(seq_len(nrow(z))), breaks=n.folds, labels=FALSE)
all.confusion.tables <- list()
for (i in seq_len(n.folds)) {
train <- filter(z, folds != i)
test <- filter(z, folds == i)
glm.train <- glm(y ~.,family = binomial, data = train)
mod_pred_probs =predict(glm.train,test, type= "response")
pred.class <- ifelse(mod_pred_probs< 0, 0, 1)
all.confusion.tables[[i]] <- table(pred = pred.class, true = test$y)
}
misclassrisk <- function(x) { (sum(x) - sum(diag(x)))/sum(x) }
risk <- sapply(all.confusion.tables, misclassrisk)
return(table(risk))
mean(risk)}
When I run foo(birtht,"low")
I get the error:
Error in model.frame.default(formula = y ~ ., data = train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'low')
Does any one know why I am getting the error or how I can avoid it?
I have tried to reproduce the results from the answers for this question “Estimating random effects and applying user defined correlation/covariance structure with R lme4 or nlme package “ https://stats.stackexchange.com/questions/18563/estimating-random-effects-and-applying-user-defined-correlation-covariance-struc
Aaron Rendahl's codes
library(pedigreemm)
relmatmm <- function (formula, data, family = NULL, REML = TRUE, relmat = list(),
control = list(), start = NULL, verbose = FALSE, subset,
weights, na.action, offset, contrasts = NULL, model = TRUE,
x = TRUE, ...)
{
mc <- match.call()
lmerc <- mc
lmerc[[1]] <- as.name("lmer")
lmerc$relmat <- NULL
if (!length(relmat))
return(eval.parent(lmerc))
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
lmerc$doFit <- FALSE
lmf <- eval(lmerc, parent.frame())
relfac <- relmat
relnms <- names(relmat)
stopifnot(all(relnms %in% names(lmf$FL$fl)))
asgn <- attr(lmf$FL$fl, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(lmf$FL$fl)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be associated with only one random effects term")
Zt <- lmf$FL$trms[[tn]]$Zt
relmat[[i]] <- Matrix(relmat[[i]][rownames(Zt), rownames(Zt)],
sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
lmf$FL$trms[[tn]]$Zt <- lmf$FL$trms[[tn]]$A <- relfac[[i]] %*% Zt
}
ans <- do.call(if (!is.null(lmf$glmFit))
lme4:::glmer_finalize
else lme4:::lmer_finalize, lmf)
ans <- new("pedigreemm", relfac = relfac, ans)
ans#call <- match.call()
ans
}
the original example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat), data=mydata)
here is the error message
Error in lmf$FL : $ operator not defined for this S4 class
In addition: Warning message:
In checkArgs("lmer", doFit = FALSE) : extra argument(s) ‘doFit’ disregarded
I will appreciate any help ?
Thanks
This is a re-implementation of the previous code -- I have done some slight modifications, and I have not tested it in any way -- test yourself and/or use at your own risk.
First create a slightly more modularized function that constructs the deviance function and fits the model:
doFit <- function(lmod,lmm=TRUE) {
## see ?modular
if (lmm) {
devfun <- do.call(mkLmerDevfun, lmod)
opt <- optimizeLmer(devfun)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
} else {
devfun <- do.call(mkGlmerDevfun, lmod)
opt <- optimizeGlmer(devfun)
devfun <- updateGlmerDevfun(devfun, lmod$reTrms)
opt <- optimizeGlmer(devfun, stage=2)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
}
}
Now create a function to construct the object that doFit needs and modify it:
relmatmm <- function (formula, ..., lmm=TRUE, relmat = list()) {
ff <- if (lmm) lFormula(formula, ...) else glFormula(formula, ...)
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
relnms <- names(relmat)
relfac <- relmat
flist <- ff$reTrms[["flist"]] ## list of factors
## random-effects design matrix components
Ztlist <- ff$reTrms[["Ztlist"]]
stopifnot(all(relnms %in% names(flist)))
asgn <- attr(flist, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(flist)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be",
" associated with only one random effects term")
zn <- rownames(Ztlist[[i]])
relmat[[i]] <- Matrix(relmat[[i]][zn,zn],sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
Ztlist[[i]] <- relfac[[i]] %*% Ztlist[[i]]
}
ff$reTrms[["Ztlist"]] <- Ztlist
ff$reTrms[["Zt"]] <- do.call(rBind,Ztlist)
fit <- doFit(ff,lmm)
}
Example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat),
data=mydata)
This runs -- I don't know if the output is correct. It also doesn't make the resulting object into a pedigreemm object ...