Looping a function in R - r

I have written a cross validation/grid search style code in R that tries to find an optimal threshold value for a given value of mtry (using the random forest algorithm). I have posted my code below using the Sonar data from the library mlbench However, there seems to be some problems with this code.
library(caret)
library(mlbench)
library(randomForest)
res <- matrix(0, nrow = 10, ncol = 6)
colnames(res) <- c("mtry","Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 17, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
rep <- matrix(0, nrow = 10, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg_Accuracy", "Avg_PosPred", "Avg_NegPred", "Avg_F_Value")
data(Sonar)
N=Sonar
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for (mtry in 5:14) {
K=mtry-4
for(thresh in seq(1,9,0.5)) {
J = 2*thresh-1
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Class~., data = N_train, mtry=mtry, ntree=500)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
confusion = confusionMatrix(N_test$Class, label)
res[I,1]=mtry
res[I,2]=thresh
res[I,3]=confusion$overall[1]
res[I,4]=confusion$byClass[3]
res[I,5]=confusion$byClass[4]
res[I,6]=confusion$byClass[7]
}
print(res)
out[J,1] = mtry
out[J,2] = thresh
out[J,3] = mean(res[,2])
out[J,4] = mean(res[,3])
out[J,5] = mean(res[,4])
out[J,6] = mean(res[,5])
}
print(out)
rep[K,1] = mtry
rep[K,2] = thresh
rep[K,3] = mean(out[,2])
rep[K,4] = mean(out[,3])
rep[K,5] = mean(out[,4])
rep[K,6] = mean(out[,5])
}
print(rep)
Earlier, I wrote a similar code with the "iris" dataset, and I did not seem to have any problems:
library(caret)
library(randomForest)
data(iris)
N <- iris
N$Species = ifelse(N$Species == "setosa", "a", "b")
N$Species = as.factor(N$Species)
res <- matrix(0, nrow = 10, ncol = 5)
colnames(res) <- c("Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 9, ncol = 5)
colnames(out) <- c("Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for(J in 1:9) {
thresh = J/10
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Species~., data = N_train, mtry=3, ntree=10)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,1]>=thresh,"a","b"))
confusion = confusionMatrix(N_test$Species, label)
res[I,1]=thresh
res[I,2]=confusion$overall[1]
res[I,3]=confusion$byClass[3]
res[I,4]=confusion$byClass[4]
res[I,5]=confusion$byClass[7]
}
print(res)
out[J,1] = thresh
out[J,2] = mean(res[,2])
out[J,3] = mean(res[,3])
out[J,4] = mean(res[,4])
out[J,5] = mean(res[,5])
}
print(out)
Could someone please assist me in debugging the first code?
Thanks

You need to close parenthesis ) in your for loop.
Replace this
for(thresh in seq(1,9,0.5) {
with
for(thresh in seq(1,9,0.5)) {
Update:
Also, it appears that your thresh is always above 1 giving a single value R in the label, as it is never above thresh.
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
and that creates a problem in the next statement
confusion = confusionMatrix(N_test$Class, label)
I tested with 0.5, and I get no error.
label = as.factor(ifelse(pred[,2]>=0.5,"M","R"))
If you can define a better thresh - to stay between 0 and 1, you should be fine.

Related

Parallelizing codes for efficiency in R

I am trying a variable screening using the SIS package in R using different tunings and penalties. I have for loops which will take long for relatively large data. I am trying to parallelize this piece of code for efficiency. But I am running into some errors.
Please kindly help if you can. Thanks for your time and help.
#load library
library(parallel)
library(doParallel)
library(foreach)
library(SIS)
library(dplyr)
data('leukemia.train', package = 'SIS') #data for practice
y.train = leukemia.train[,dim(leukemia.train)[2]]
x.train = as.matrix(leukemia.train[,-dim(leukemia.train)[2]])
x.train = standardize(x.train)
#penalties for screening
penalty <- c("lasso", "SCAD", "MCP")
#storeage
RESULT <- NULL
alldat <- NULL
for(pen in penalty){
#tuning para
tune <- c("aic", "bic", "ebic", "cv")
#storage
OUT <- NULL
dat <- NULL
for(tun in tune){
#SIS model for ultra-high dimensional screening
mod=SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pen, tune = tun, varISIS = 'aggr', seed = 21) #model
out <- mod$ix
coff <- mod$coef.est
x <- x.train %>% as.data.frame()
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- coff %>% as.data.frame()
OUT[[tun]] <- cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
names(OUT[tun]) <- paste(tun)
dat[[tun]] <- dat0
#store as list for cases
names(dat[tun]) <- paste(tun)
}
#list of all results of coef
RESULT[[pen]] <- OUT
dat #list of data sets
alldat[[pen]] <-
names(RESULT[pen]) <- paste(pen)
names(alldat[pen]) <- paste(pen)
}
#parallelize here
pentune.df <- expand.grid(
tune = c("aic", "bic", "ebic", "cv"),
penalty = c("lasso", "SCAD", "MCP")
)# use expand for to obtain possible combinations
#create and register cluster
n.cores <- parallel::detectCores() - 2
my.cluster <- parallel::makeCluster(n.cores)
doParallel::registerDoParallel(cl = my.cluster)
foreach(
tun = pentune.df$tun,
pena = pentune.df$pena,
.combine = 'list',
.packages = "SIS"
) %dopar% {
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
OUT <- return(cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL)))
}
parallel::stopCluster(cl = my.cluster) #end job
normally it is best if you can narrow in on the error that you are getting it makes it easier to help. The main issue seemed to be simplifying your iterator within the foreach and ensuring the penalty and tune variables for SIS
are character. The expand.grid function is exactly what you need but the resulting columns are factors. So these need to be converted back when inserting into the SIS function.
Finally, in your last line of the %dopar% {} don't define a variable and you don't need to return. The last object returns automatically. So you can remove OUT <- return().
I have added some comments in the code below to indicate exactly what I have changed.
foreach(
i = 1:nrow(pentune.df), # define a simpler iterator
.combine = 'list',
.packages = "SIS"
) %dopar% {
# define loop variables and ensure they are character
pena <- as.character(pentune.df[i, 'penalty'])
tun <- as.character(pentune.df[i, 'tune'])
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
# don't define a variable here just create the object you want
cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
}

R Caret: seeds and createMultiFolds

I want to make my code reproducible and use the seeds argument as well as createMultiFolds within a loop.
I set up this code:
cv_model <- function(dat, targets){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
results <<- list(weight = NA, vari = NA)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
c(MAPE = mape, RMSE = RMSE)
}
for (i in 1:length(targets)) {
set.seed(43)
folds <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 3)
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
# specifiy trainControl
control <- caret::trainControl(method="repeatedcv", number=3, repeats=3, search="grid",
savePred =T,
summaryFunction = sumfct, index = folds, seeds = myseeds)
# fixed mtry
params <- data.frame(mtry = 2)
# choose predictor columns by excluding target columns
preds <- dat[, -c(which(names(dat) == "Time"),
which(names(dat) == "Chick"),
which(names(dat) == "Diet"))]
# set target variables
response <- dat[, which(names(dat) == targets[i])]
set.seed(42)
model <- caret::train(x = preds,
y = response,
data = dat,
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=params,
trControl=control)
results[[i]] <<- model
}
}
targets <- c("weight", "vari")
dat <- as.data.frame(ChickWeight)
# generate random numbers
set.seed(1)
dat$vari <- c(runif(nrow(dat)))
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
# use function
cv_model(dat = dat, targets = targets)
# end parallel computing
stopCluster(cl)
# unregister doParallel by registering DoSeq (do sequential)
registerDoSEQ()
After running the code, the error message Error: Please make sure 'y' is a factor or numeric value.. occurs.
If you delete the following lines
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
and within trainControl , seeds = myseeds, then the code runs without an error message.
How can I fix the error and at the same time provide seeds and createMultiFolds within the code?

Multi asset portfolio optimization R

I am new to Portfolio Optimization in R. When I add more than 25 assets (each asset has about 25 observations) to my portfolio, optimize.portfolio does not find any solutions. When I run this program with 25 assets or less, it works properly and plots the efficient frontier. Any help on this is much appreciated.
library(data.table)
library(readxl)
library(PerformanceAnalytics)
library(PortfolioAnalytics)
library(ROI)
library(foreach)
library(DEoptim)
library(iterators)
library(fGarch)
library(Rglpk)
library(quadprog)
library(ROI.plugin.glpk)
library(ROI.plugin.quadprog)
library(ROI.plugin.symphony)
library(pso)
library(GenSA)
library(corpcor)
library(testthat)
library(nloptr)
library(MASS)
library(robustbase)
library(ggplot2)
setwd("~/R")
#BRING IN DATA
returns.data <- read_excel("portfolio_sample_4asset.xlsx", sheet = "portfolio")
returns.data <- data.frame(returns.data)
row.names(returns.data) <- (returns.data$year)
returns.data$year <- NULL
meanReturns <- colMeans(returns.data)
#GENERATE COVARIANCE AND CORRELATION TABLES
cov.pop <- function(x,y=NULL) {
cov(x,y)*(NROW(x)-1)/NROW(x)
}
covMat <- cov.pop(returns.data)
corMat <- cor(returns.data)
#SPECIFY PORTFOLIO OBJECT
port <- portfolio.spec(assets = colnames(returns.data))
#CONSTRAINTS
port <- add.constraint(port,type="weight_sum",min=0.99, max=1.01)
#rportfolios <- random_portfolios(port, permutations = 500, rp_method = "sample", eliminate = TRUE)
#OPTIMIZATION SETUP
minreturnLimit <- min(colMeans(returns.data))
maxreturnLimit <- max(colMeans(returns.data))
minret <- minreturnLimit
maxret <- maxreturnLimit
vec <- seq(minret, maxret, length.out = 100)
eff.frontier <- data.frame(Risk = rep(NA, length(vec)), Return = rep(NA, length(vec)))
frontier.weights <- mat.or.vec(nr = length(vec), nc = ncol(returns.data))
colnames(frontier.weights) <- colnames(returns.data)
#GENERATE EFFICIENT FRONTIER
#In add.constraint...the type is return, as in, it is targeting a specific return specified by vec
#Subsequently, it looks for the portfolio that minimizes StdDev for that return constraint...this is the objective
for(i in 1:length(vec)){
eff.port <- add.constraint(port, type = "return", name = "mean", return_target = vec[i])
eff.port <- add.objective(eff.port, type = "risk", name = "var")
# eff.port <- add.objective(eff.port, type = "weight_concentration", name = "HHI",
# conc_aversion = 0.001)
eff.port <- optimize.portfolio(returns.data, eff.port, optimize_method = "ROI")
eff.frontier$Risk[i] <- sqrt(t(eff.port$weights) %*% covMat %*% eff.port$weights)
eff.frontier$Return[i] <- eff.port$weights %*% meanReturns
eff.frontier$Sharperatio[i] <- eff.port$Return[i] / eff.port$Risk[i]
frontier.weights[i,] = eff.port$weights
print(paste(round(i/length(vec) * 100, 0), "% done..."))
}
#PLOT EFFICIENT FRONTIER
ggplot(eff.frontier, aes(x=eff.frontier$Risk, y=eff.frontier$Return)) + geom_point(shape = 18, color = "limegreen", size = 2) + ggtitle("Portfolio Optimization") + labs(x="Risk",y="Return")
eff.frontier$Sharpe <- eff.frontier$Return / eff.frontier$Risk
I figured out why my optimization wasn't working - my dataset contained a rank-deficient matrix which could not be solved (more columns than rows).

caret: "Some row.names duplicated" warning when using RFE

I am building a toy dataset based on the linear problem from page 5 from this paper in order to test feature selection using caret's RFE+SVM with rbf kernel. However, when RFE finishes, I get a warning per bootstrap iteration with the following message: "In data.row.names(row.names, rowsi, i) : some row.names duplicated:" followed by many row numbers, until the output is truncated.
Is this caused because the bootstrap may be selecting samples with replacement and therefore duplicating rows in the bootstrapped data, or is there something else wrong with this? Any advice appreciated (please forgive the lazy implementation of the artificial dataset).
library(caret)
################
# 1. Building dataset
################
set.seed(1)
n.samples <- 500
y <- round(runif(n = n.samples, min=0, max=1))
data <- matrix(nrow=n.samples, ncol=202)
for(i in 1:n.samples){
toss <- runif(n=1, min=0, max=1)
if(toss <= 0.7) {
for(j in 1:3){
data[i,j] <- y[i]*rnorm(n = 1, mean = i, sd = 1)
}
for(j in 4:6){
data[i,j] <- rnorm(n = 1, mean = 0, sd = 1)
}
} else {
for(j in 1:3){
data[i,j] <- rnorm(n=1, mean=0, sd=1)
}
for(j in 4:6){
data[i,j] <- y[i]*rnorm(n=1, mean=i-3, sd = 1)
}
}
for(j in 7:202){
data[i,j] <- rnorm(n = 1, mean = 0, sd = 20)
}
}
colnames(data) <- c(paste("s", 1:6, sep = ""), paste('ns', 7:202, sep=''))
rownames(data) <- paste('sample', 1:n.samples, sep='')
################
# 2. Perform SVM - RFE
################
set.seed(1)
rfe.control.settings <- rfeControl(functions = caretFuncs,
method = 'boot',
number = 30,
verbose = TRUE)
svm.fit <- rfe(x=data,
y=y,
sizes=c(1,2,3,4),
rfeControl = rfe.control.settings,
method = 'svmRadial') #passing options to train / caretFuncs
I was facing the same problem, and what fixed it for me is changing the data class from matrix to data.frame.

Reproducing results from previous answer is not working due to using new version of lme4

I have tried to reproduce the results from the answers for this question “Estimating random effects and applying user defined correlation/covariance structure with R lme4 or nlme package “ https://stats.stackexchange.com/questions/18563/estimating-random-effects-and-applying-user-defined-correlation-covariance-struc
Aaron Rendahl's codes
library(pedigreemm)
relmatmm <- function (formula, data, family = NULL, REML = TRUE, relmat = list(),
control = list(), start = NULL, verbose = FALSE, subset,
weights, na.action, offset, contrasts = NULL, model = TRUE,
x = TRUE, ...)
{
mc <- match.call()
lmerc <- mc
lmerc[[1]] <- as.name("lmer")
lmerc$relmat <- NULL
if (!length(relmat))
return(eval.parent(lmerc))
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
lmerc$doFit <- FALSE
lmf <- eval(lmerc, parent.frame())
relfac <- relmat
relnms <- names(relmat)
stopifnot(all(relnms %in% names(lmf$FL$fl)))
asgn <- attr(lmf$FL$fl, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(lmf$FL$fl)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be associated with only one random effects term")
Zt <- lmf$FL$trms[[tn]]$Zt
relmat[[i]] <- Matrix(relmat[[i]][rownames(Zt), rownames(Zt)],
sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
lmf$FL$trms[[tn]]$Zt <- lmf$FL$trms[[tn]]$A <- relfac[[i]] %*% Zt
}
ans <- do.call(if (!is.null(lmf$glmFit))
lme4:::glmer_finalize
else lme4:::lmer_finalize, lmf)
ans <- new("pedigreemm", relfac = relfac, ans)
ans#call <- match.call()
ans
}
the original example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat), data=mydata)
here is the error message
Error in lmf$FL : $ operator not defined for this S4 class
In addition: Warning message:
In checkArgs("lmer", doFit = FALSE) : extra argument(s) ‘doFit’ disregarded
I will appreciate any help ?
Thanks
This is a re-implementation of the previous code -- I have done some slight modifications, and I have not tested it in any way -- test yourself and/or use at your own risk.
First create a slightly more modularized function that constructs the deviance function and fits the model:
doFit <- function(lmod,lmm=TRUE) {
## see ?modular
if (lmm) {
devfun <- do.call(mkLmerDevfun, lmod)
opt <- optimizeLmer(devfun)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
} else {
devfun <- do.call(mkGlmerDevfun, lmod)
opt <- optimizeGlmer(devfun)
devfun <- updateGlmerDevfun(devfun, lmod$reTrms)
opt <- optimizeGlmer(devfun, stage=2)
mkMerMod(environment(devfun), opt, lmod$reTrms, fr = lmod$fr)
}
}
Now create a function to construct the object that doFit needs and modify it:
relmatmm <- function (formula, ..., lmm=TRUE, relmat = list()) {
ff <- if (lmm) lFormula(formula, ...) else glFormula(formula, ...)
stopifnot(is.list(relmat), length(names(relmat)) == length(relmat))
relnms <- names(relmat)
relfac <- relmat
flist <- ff$reTrms[["flist"]] ## list of factors
## random-effects design matrix components
Ztlist <- ff$reTrms[["Ztlist"]]
stopifnot(all(relnms %in% names(flist)))
asgn <- attr(flist, "assign")
for (i in seq_along(relmat)) {
tn <- which(match(relnms[i], names(flist)) == asgn)
if (length(tn) > 1)
stop("a relationship matrix must be",
" associated with only one random effects term")
zn <- rownames(Ztlist[[i]])
relmat[[i]] <- Matrix(relmat[[i]][zn,zn],sparse = TRUE)
relfac[[i]] <- chol(relmat[[i]])
Ztlist[[i]] <- relfac[[i]] %*% Ztlist[[i]]
}
ff$reTrms[["Ztlist"]] <- Ztlist
ff$reTrms[["Zt"]] <- do.call(rBind,Ztlist)
fit <- doFit(ff,lmm)
}
Example
set.seed(1234)
mydata <- data.frame (gen = factor(rep(1:10, each = 10)),
repl = factor(rep(1:10, 10)),
yld = rnorm(10, 5, 0.5))
library(lme4)
covmat <- round(nearPD(matrix(runif(100, 0, 0.2), nrow = 10))$mat, 2)
diag(covmat) <- diag(covmat)/10+1
rownames(covmat) <- colnames(covmat) <- levels(mydata$gen)
m <- relmatmm(yld ~ (1|gen) + (1|repl), relmat=list(gen=covmat),
data=mydata)
This runs -- I don't know if the output is correct. It also doesn't make the resulting object into a pedigreemm object ...

Resources