caret: "Some row.names duplicated" warning when using RFE - r

I am building a toy dataset based on the linear problem from page 5 from this paper in order to test feature selection using caret's RFE+SVM with rbf kernel. However, when RFE finishes, I get a warning per bootstrap iteration with the following message: "In data.row.names(row.names, rowsi, i) : some row.names duplicated:" followed by many row numbers, until the output is truncated.
Is this caused because the bootstrap may be selecting samples with replacement and therefore duplicating rows in the bootstrapped data, or is there something else wrong with this? Any advice appreciated (please forgive the lazy implementation of the artificial dataset).
library(caret)
################
# 1. Building dataset
################
set.seed(1)
n.samples <- 500
y <- round(runif(n = n.samples, min=0, max=1))
data <- matrix(nrow=n.samples, ncol=202)
for(i in 1:n.samples){
toss <- runif(n=1, min=0, max=1)
if(toss <= 0.7) {
for(j in 1:3){
data[i,j] <- y[i]*rnorm(n = 1, mean = i, sd = 1)
}
for(j in 4:6){
data[i,j] <- rnorm(n = 1, mean = 0, sd = 1)
}
} else {
for(j in 1:3){
data[i,j] <- rnorm(n=1, mean=0, sd=1)
}
for(j in 4:6){
data[i,j] <- y[i]*rnorm(n=1, mean=i-3, sd = 1)
}
}
for(j in 7:202){
data[i,j] <- rnorm(n = 1, mean = 0, sd = 20)
}
}
colnames(data) <- c(paste("s", 1:6, sep = ""), paste('ns', 7:202, sep=''))
rownames(data) <- paste('sample', 1:n.samples, sep='')
################
# 2. Perform SVM - RFE
################
set.seed(1)
rfe.control.settings <- rfeControl(functions = caretFuncs,
method = 'boot',
number = 30,
verbose = TRUE)
svm.fit <- rfe(x=data,
y=y,
sizes=c(1,2,3,4),
rfeControl = rfe.control.settings,
method = 'svmRadial') #passing options to train / caretFuncs

I was facing the same problem, and what fixed it for me is changing the data class from matrix to data.frame.

Related

How can I catch fatal error on rpart in R for loop?

I am perofrimn grid search using rpart tree models. On some iteration I got fatal error due to values pass in control argument. Is there an easy way to stop R from crashing if it cannot fit tree in this iteration?
#large_grid
complexity_par_val <- seq(0.001, 0.01, 0.001)
min_bin_val <- seq(500, 5000, 500)
max_depth_val <- seq(1, 30, 1)
freq_tree_large_grid <- expand.grid(cp = complexity_par_val, min_bin = min_bin_val, max_depth = max_depth_val)
#random serach
set.seed(123)
n_search <- 500
sample_for_r_search <- freq_tree_large_grid[sample(nrow(freq_tree_large_grid), n_search), ]
result_of_r_search_freq_old <- result_of_r_search_freq
result_of_r_search_freq <- data.frame()
start_time <- Sys.time()
for(i in 1:n_search) {
cp_1 <- sample_for_r_search$cp[i]
min_bin_1 <- sample_for_r_search$min_bin[i]
max_depth_1 <- sample_for_r_search$max_depth[i]
cntr <- list(cp=cp_1, minbucket = min_bin_1, maxdepth = max_depth_1, xval = 0)
sum_dev <- 0
for (j in 1:8){
FREQ_V <- FREQ_TRAIN[FREQ_TRAIN$ValRandom10 == j,]
FREQ_D <- FREQ_TRAIN[FREQ_TRAIN$ValRandom10 != j,]
tryCatch({
tree <- rpart( formula = formula_tree,
data = FREQ_D,
method = "poisson" ,
control = cntr
)}, error=function(e){})
pred <- predict(tree, newdata = FREQ_V )*FREQ_V$Exposure
Dev <- Deviance_Poisson(pred, FREQ_V$ClaimNb)
sum_dev <- sum_dev+Dev
print('cv')
print(j)
}
CV8_DEV <- sum_dev/8
result_of_r_search_freq <- rbind(result_of_r_search_freq, data.frame(CV8_DEV, cp_1, min_bin_1, max_depth_1))
print('ending the cross validation nr:')
print(i)
}
end_time <- Sys.time()

Looping a function in R

I have written a cross validation/grid search style code in R that tries to find an optimal threshold value for a given value of mtry (using the random forest algorithm). I have posted my code below using the Sonar data from the library mlbench However, there seems to be some problems with this code.
library(caret)
library(mlbench)
library(randomForest)
res <- matrix(0, nrow = 10, ncol = 6)
colnames(res) <- c("mtry","Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 17, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
rep <- matrix(0, nrow = 10, ncol = 6)
colnames(out) <- c("mtry","Threshhold","Avg_Accuracy", "Avg_PosPred", "Avg_NegPred", "Avg_F_Value")
data(Sonar)
N=Sonar
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for (mtry in 5:14) {
K=mtry-4
for(thresh in seq(1,9,0.5)) {
J = 2*thresh-1
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Class~., data = N_train, mtry=mtry, ntree=500)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
confusion = confusionMatrix(N_test$Class, label)
res[I,1]=mtry
res[I,2]=thresh
res[I,3]=confusion$overall[1]
res[I,4]=confusion$byClass[3]
res[I,5]=confusion$byClass[4]
res[I,6]=confusion$byClass[7]
}
print(res)
out[J,1] = mtry
out[J,2] = thresh
out[J,3] = mean(res[,2])
out[J,4] = mean(res[,3])
out[J,5] = mean(res[,4])
out[J,6] = mean(res[,5])
}
print(out)
rep[K,1] = mtry
rep[K,2] = thresh
rep[K,3] = mean(out[,2])
rep[K,4] = mean(out[,3])
rep[K,5] = mean(out[,4])
rep[K,6] = mean(out[,5])
}
print(rep)
Earlier, I wrote a similar code with the "iris" dataset, and I did not seem to have any problems:
library(caret)
library(randomForest)
data(iris)
N <- iris
N$Species = ifelse(N$Species == "setosa", "a", "b")
N$Species = as.factor(N$Species)
res <- matrix(0, nrow = 10, ncol = 5)
colnames(res) <- c("Threshhold","Accuracy", "PositivePred", "NegativePred", "F-value")
out <- matrix(0, nrow = 9, ncol = 5)
colnames(out) <- c("Threshhold","Avg.Accuracy", "Avg.PosPred", "Avg.NegPred", "Avg.F_Value")
### creating 10 folds
folds <- cut(seq(1,nrow(N)),breaks=10,labels=FALSE)
for(J in 1:9) {
thresh = J/10
dataset<-N[sample(nrow(N)),] #### mix up the dataset N
for(I in 1:10){
#Segement your data by fold using the which() function
testIndexes <- which(folds==I,arr.ind=TRUE)
N_test <- dataset[testIndexes, ] ### select each fold for test
N_train <- dataset[-testIndexes, ] ### select rest for training
rf = randomForest(Species~., data = N_train, mtry=3, ntree=10)
pred = predict(rf, N_test, type="prob")
label = as.factor(ifelse(pred[,1]>=thresh,"a","b"))
confusion = confusionMatrix(N_test$Species, label)
res[I,1]=thresh
res[I,2]=confusion$overall[1]
res[I,3]=confusion$byClass[3]
res[I,4]=confusion$byClass[4]
res[I,5]=confusion$byClass[7]
}
print(res)
out[J,1] = thresh
out[J,2] = mean(res[,2])
out[J,3] = mean(res[,3])
out[J,4] = mean(res[,4])
out[J,5] = mean(res[,5])
}
print(out)
Could someone please assist me in debugging the first code?
Thanks
You need to close parenthesis ) in your for loop.
Replace this
for(thresh in seq(1,9,0.5) {
with
for(thresh in seq(1,9,0.5)) {
Update:
Also, it appears that your thresh is always above 1 giving a single value R in the label, as it is never above thresh.
label = as.factor(ifelse(pred[,2]>=thresh,"M","R"))
and that creates a problem in the next statement
confusion = confusionMatrix(N_test$Class, label)
I tested with 0.5, and I get no error.
label = as.factor(ifelse(pred[,2]>=0.5,"M","R"))
If you can define a better thresh - to stay between 0 and 1, you should be fine.

Unable to run foreach in doParallel package

I'm trying to run the following R codes (https://www.r-bloggers.com/general-regression-neural-network-with-r/) to implement a General Regression Neural Network (GRNN) in R. "foreach" function is used (two times) to search for the optimal value of sigma.
pkgs <- c('MASS', 'doParallel', 'foreach', 'grnn')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 8)
data(Boston)
# PRE-PROCESSING DATA
X <- Boston[-14]
st.X <- scale(X)
Y <- Boston[14]
boston <- data.frame(st.X, Y)
# SPLIT DATA SAMPLES
set.seed(2013)
rows <- sample(1:nrow(boston), nrow(boston) - 200)
set1 <- boston[rows, ]
set2 <- boston[-rows, ]
# DEFINE A FUNCTION TO SCORE GRNN
pred_grnn <- function(x, nn){
xlst <- split(x, 1:nrow(x))
pred <- foreach(i = xlst, .combine = rbind) %dopar% {
data.frame(pred = guess(nn, as.matrix(i)), i, row.names = NULL)
}
}
# SEARCH FOR THE OPTIMAL VALUE OF SIGMA BY THE VALIDATION SAMPLE
cv <- foreach(s = seq(0.2, 1, 0.05), .combine = rbind) %dopar% {
grnn <- smooth(learn(set1, variable.column = ncol(set1)), sigma = s)
pred <- pred_grnn(set2[, -ncol(set2)], grnn)
test.sse <- sum((set2[, ncol(set2)] - pred$pred)^2)
data.frame(s, sse = test.sse)
}
cat("\n### SSE FROM VALIDATIONS ###\n")
print(cv)
jpeg('grnn_cv.jpeg', width = 800, height = 400, quality = 100)
with(cv, plot(s, sse, type = 'b'))
cat("\n### BEST SIGMA WITH THE LOWEST SSE ###\n")
print(best.s <- cv[cv$sse == min(cv$sse), 1])
# SCORE THE WHOLE DATASET WITH GRNN
final_grnn <- smooth(learn(set1, variable.column = ncol(set1)), sigma = best.s)
pred_all <- pred_grnn(boston[, -ncol(set2)], final_grnn)
jpeg('grnn_fit.jpeg', width = 800, height = 400, quality = 100)
plot(pred_all$pred, boston$medv)
dev.off()
But the following error occurred after the second "foreach" function (I mean, after cv).
Error in { : task 1 failed - "unused argument (sigma = s)"
any help would be appreciated.

R Caret: seeds and createMultiFolds

I want to make my code reproducible and use the seeds argument as well as createMultiFolds within a loop.
I set up this code:
cv_model <- function(dat, targets){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
results <<- list(weight = NA, vari = NA)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
c(MAPE = mape, RMSE = RMSE)
}
for (i in 1:length(targets)) {
set.seed(43)
folds <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 3)
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
# specifiy trainControl
control <- caret::trainControl(method="repeatedcv", number=3, repeats=3, search="grid",
savePred =T,
summaryFunction = sumfct, index = folds, seeds = myseeds)
# fixed mtry
params <- data.frame(mtry = 2)
# choose predictor columns by excluding target columns
preds <- dat[, -c(which(names(dat) == "Time"),
which(names(dat) == "Chick"),
which(names(dat) == "Diet"))]
# set target variables
response <- dat[, which(names(dat) == targets[i])]
set.seed(42)
model <- caret::train(x = preds,
y = response,
data = dat,
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=params,
trControl=control)
results[[i]] <<- model
}
}
targets <- c("weight", "vari")
dat <- as.data.frame(ChickWeight)
# generate random numbers
set.seed(1)
dat$vari <- c(runif(nrow(dat)))
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
# use function
cv_model(dat = dat, targets = targets)
# end parallel computing
stopCluster(cl)
# unregister doParallel by registering DoSeq (do sequential)
registerDoSEQ()
After running the code, the error message Error: Please make sure 'y' is a factor or numeric value.. occurs.
If you delete the following lines
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
and within trainControl , seeds = myseeds, then the code runs without an error message.
How can I fix the error and at the same time provide seeds and createMultiFolds within the code?

Performing t-Test Selection manually

I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

Resources