Missing values NaN in R

Please help me: I get an error when trying to produce output with 10% test data (90% training data) and 1000 simulations.
# load the required packages
library(caret)
library(KernelKnn)

# compute prediction accuracy over B simulations
sim = function(B, p1, k1)
{
  hasileu = matrix(0, B, 4)
  for (i in 1:B)
  {
    # load the Boston data
    boston = MASS::Boston
    # split into training and test sets
    indexes = createDataPartition(boston$medv, p = p1, list = F)
    train = boston[indexes, ]
    test = boston[-indexes, ]
    train_x = train[, -14]
    train_x = scale(train_x)[,]
    train_y = train[, 14]
    test_x = test[, -14]
    test_x = scale(test[, -14])[,]
    test_y = test[, 14]
    # KNN regression model
    pred1 = KernelKnn(train_x, TEST_data = test_x, train_y, k = k1,
                      method = 'euclidean', weights_function = NULL, regression = T)
    # accuracy measures
    mse = mean((test_y - pred1)^2)
    mae = mean(abs(test_y - pred1))
    rmse = sqrt(mse)
    mape = mean(abs((test_y - pred1)/test_y))
    r2 = cor(test_y, pred1)^2
    hasileu[i, 1] = rmse
    hasileu[i, 2] = mape
    hasileu[i, 3] = mae
    hasileu[i, 4] = r2
  }
  return(apply(hasileu, 2, mean))
}

# evaluate a range of K values
hitung.variasi.k = function(B, p, K)
{
  has11 = matrix(0, K, 4)
  for (i in 1:K)
  {
    has11[i, ] = sim(B, p, i)
  }
  has11
}

# accuracy results for the chosen test-data proportion
K = 10
has11 = hitung.variasi.k(1000, 0.9, K)
Error in KernelKnn(train_x, TEST_data = test_x, train_y, k = k1, method = "euclidean", :
the TEST_data includes missing values

The following code uses tryCatch() to catch the error and continue execution to the end.
library(caret)
library(KernelKnn)

# compute prediction accuracy over B simulations
sim = function(B, p1, k1)
{
  hasileu = matrix(0, B, 4)
  for (i in 1:B)
  {
    # load the Boston data
    boston = MASS::Boston
    # split into training and test sets
    indexes = createDataPartition(boston$medv, p = p1, list = F)
    train = boston[indexes, ]
    test = boston[-indexes, ]
    train_x = train[, -14]
    train_x = scale(train_x)[,]
    train_y = train[, 14]
    test_x = test[, -14]
    test_x = scale(test[, -14])[,]
    test_y = test[, 14]
    errorFlag = 0
    # KNN regression model, wrapped so an error skips this iteration
    pred1 = tryCatch(
      KernelKnn(train_x, TEST_data = test_x, train_y, k = k1,
                method = 'euclidean', weights_function = NULL, regression = T),
      error = function(e) {
        print(e)
        print(paste("error at loop", i))
        errorFlag <<- 1
      }
    )
    if (errorFlag == 0)
    {
      # accuracy measures
      mse = mean((test_y - pred1)^2)
      mae = mean(abs(test_y - pred1))
      rmse = sqrt(mse)
      mape = mean(abs((test_y - pred1)/test_y))
      r2 = cor(test_y, pred1)^2
      hasileu[i, 1] = rmse
      hasileu[i, 2] = mape
      hasileu[i, 3] = mae
      hasileu[i, 4] = r2
    }
  }
  return(apply(hasileu, 2, mean))
}

# evaluate a range of K values
hitung.variasi.k = function(B, p, K)
{
  has11 = matrix(0, K, 4)
  for (i in 1:K)
  {
    has11[i, ] = sim(B, p1 = p, k1 = i)
  }
  return(has11)
}

# accuracy results for the chosen test-data proportion
K = 10
has11 = hitung.variasi.k(B = 1000, p = 0.9, K = 10)
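Two caveats. First, tryCatch() only skips the failing iterations; the skipped rows stay all zero in hasileu, which pulls the column means toward zero, so dividing by the number of successful runs would be more accurate. Second, a plausible root cause (an assumption, since it depends on the random split): scale() divides each column by its standard deviation, and with only ~10% of rows in the test set a near-constant column such as the binary chas can have zero variance there, producing NaN. Scaling the test set with the training set's statistics avoids this and is also the conventional practice; a minimal sketch:

# sketch: scale the test predictors with the training set's center/scale
# (assumes the training columns are not constant, which holds for a 90% split)
mu   <- colMeans(train[, -14])
sdev <- apply(train[, -14], 2, sd)
train_x <- scale(train[, -14], center = mu, scale = sdev)
test_x  <- scale(test[, -14],  center = mu, scale = sdev)
# quick check for remaining problem columns:
which(colSums(is.na(test_x)) > 0)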

Related

R: Genetic Algorithm Feature Selection 'replace = FALSE' error

I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson. I copied the code from https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R
I keep getting this error: "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set. I also reduced the iterations from 15 to 1 for speed.
library(caret)
library(liver)

data(churn)
head(churn)

set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
                                  list = FALSE,
                                  times = 1)
train <- churn[ trainIndex, ]
test  <- churn[-trainIndex, ]

# ------------------------------------------------------------------------------
many_stats <-
  function(data, lev = levels(data$obs), model = NULL) {
    c(
      twoClassSummary(data = data, lev = levels(data$obs), model),
      prSummary(data = data, lev = levels(data$obs), model),
      mnLogLoss(data = data, lev = levels(data$obs), model),
      defaultSummary(data = data, lev = levels(data$obs), model)
    )
  }
# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats
ga_funcs$initial <- function(vars, popSize, ...) {
  x <- matrix(NA, nrow = popSize, ncol = vars)
  probs <- seq(0.1, 0.90, length = popSize)
  for (i in 1:popSize) {
    x[i, ] <-
      sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
  }
  var_count <- apply(x, 1, sum)
  if (any(var_count == 0)) {
    for (i in which(var_count == 0)) {
      p <- sample(1:length(vars), size = 2)
      x[i, p] <- 1
    }
  }
  x
}

ctrl_rs <- trainControl(
  method = "LGOCV",
  p = 0.90,
  number = 1,
  summaryFunction = many_stats,
  classProbs = TRUE,
  allowParallel = FALSE
)

ga_ctrl <- gafsControl(
  method = "cv",
  metric = c(internal = "ROC", external = "ROC"),
  maximize = c(internal = TRUE, external = TRUE),
  functions = ga_funcs,
  returnResamp = "all",
  verbose = TRUE
)

options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)

set.seed(325)
gen_algo <- gafs(
  x = train[, -20],
  y = train$churn,
  data = train,
  iters = 1,
  gafsControl = ga_ctrl,
  method = "nb",
  tuneGrid = nb_grid,
  trControl = ctrl_rs,
  metric = "ROC"
)
The code specifies replace = TRUE, so clearly I am missing something. Any help is greatly appreciated!
Thanks!
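A hedged observation based on reading the code, not a verified fix: inside initial(), vars appears to be a single integer (the number of predictors, given matrix(NA, nrow = popSize, ncol = vars) on the first line). If so, length(vars) is 1, and sample(1:length(vars), size = 2) asks for two draws from a population of one without replacement, which is exactly the reported error; the replace = TRUE on the earlier sample() call is a different statement. Sampling two column positions from 1:vars would then be the intent:

if (any(var_count == 0)) {
  for (i in which(var_count == 0)) {
    p <- sample(seq_len(vars), size = 2)  # was sample(1:length(vars), size = 2)
    x[i, p] <- 1
  }
}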

Why does rjags give Dimension mismatch taking subset of y error here?

I have written this model, but rjags gives a dimension-mismatch error. What's happening?
Error in jags.model(textConnection(model1), data = jags_data, n.chains = n_chains, :
RUNTIME ERROR:
Compilation error on line 8.
Dimension mismatch taking subset of y
library(rjags)

model1 <- "model {
  C <- 10000
  for (j in 1:nobs){
    zeros[j] ~ dpois(phi[j])
    phi[j] <- -log(L[j]) + C
    L[j] <- add[j]*(lambda[j]^y[j])*(1-lambda[j])^(1-y[j])
    add[j] = ifelse(lambda[j] == 0.5, 2, aux[j])
    aux[j] = 2*arctanh(1 - 2*lambda[j] + 10^(-323))/(1 - 2*lambda[j] + 10^(-323))
    logit(lambda[j]) <- inprod(X[j, ], beta)
  }
  beta[1] ~ dnorm(0,1)
  beta[2] ~ dgamma(1,1)
}"

n_chains = 1
n_adapt = 5000
n_iter = 10000
n_thin = 1
n_burnin = 5000

# generate data
n = 100
Ffun = plogis
design_mat = cbind(1, matrix(seq(0, 1, by = 0.2), ncol = 1))

gen_data = function(n, beta) {
  X = design_mat[sample(nrow(design_mat), size = n, replace = T), ]
  lambda = Ffun(X %*% beta)
  y = rcbern(n, lambda)
  idx = is.nan(y)
  y[idx] = runif(length(idx))
  list(X = X, y = y)
}

rcbern = function(n, lam) {
  x = runif(n)
  y = log((x*(2*lam - 1) - (lam - 1))/(1 - lam))/log(lam/(1 - lam))
  return(y)
}

beta = as.matrix(c(-3, 5))
jags_data = gen_data(n, beta)
jags_data$nobs = n

jg_model <- jags.model(textConnection(model1),
                       data = jags_data,
                       n.chains = n_chains,
                       n.adapt = n_adapt)
update(jg_model, n.iter = n_burnin)
result <- coda.samples(jg_model,
                       variable.names = c("beta"),
                       n.iter = n_iter,
                       thin = n_thin,
                       n.chains = n_chains)
beta_est = list(apply(result[[1]], 2, median))
As suggested by @user20650, the issue is that the model indexes y as a vector while gen_data() returns it as a one-column matrix (X %*% beta produces a matrix, and rcbern() preserves those dimensions). Try this code with a slight change in gen_data():
library(rjags)

model1 <- "model {
  C <- 10000
  for (j in 1:nobs){
    zeros[j] ~ dpois(phi[j])
    phi[j] <- -log(L[j]) + C
    L[j] <- add[j]*(lambda[j]^y[j])*(1-lambda[j])^(1-y[j])
    add[j] = ifelse(lambda[j] == 0.5, 2, aux[j])
    aux[j] = 2*arctanh(1 - 2*lambda[j] + 10^(-323))/(1 - 2*lambda[j] + 10^(-323))
    logit(lambda[j]) <- inprod(X[j, ], beta)
  }
  beta[1] ~ dnorm(0,1)
  beta[2] ~ dgamma(1,1)
}"

n_chains = 1
n_adapt = 5000
n_iter = 10000
n_thin = 1
n_burnin = 5000

# generate data
n = 100
Ffun = plogis
design_mat = cbind(1, matrix(seq(0, 1, by = 0.2), ncol = 1))

gen_data = function(n, beta) {
  X = design_mat[sample(nrow(design_mat), size = n, replace = T), ]
  lambda = Ffun(X %*% beta)
  y = rcbern(n, lambda)
  y <- as.vector(y)
  idx = is.nan(y)
  y[idx] = runif(length(idx))
  list(X = X, y = y)
}

rcbern = function(n, lam) {
  x = runif(n)
  y = log((x*(2*lam - 1) - (lam - 1))/(1 - lam))/log(lam/(1 - lam))
  return(y)
}

beta = as.matrix(c(-3, 5))
jags_data = gen_data(n, beta)
jags_data$nobs = n

jg_model <- jags.model(textConnection(model1),
                       data = jags_data,
                       n.chains = n_chains,
                       n.adapt = n_adapt)
update(jg_model, n.iter = n_burnin)
result <- coda.samples(jg_model,
                       variable.names = c("beta"),
                       n.iter = n_iter,
                       thin = n_thin,
                       n.chains = n_chains)
beta_est = list(apply(result[[1]], 2, median))
Output:
beta_est
[[1]]
     beta[1]      beta[2]
-0.006031984  0.692007301
You can also use y <- y[, 1, drop = TRUE] in the same function instead of as.vector().
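For intuition, the matrix sneaks in because %*% always returns a matrix in R and the arithmetic inside rcbern() preserves those dimensions. A quick standalone illustration (not part of the answer's code):

X <- cbind(1, seq(0, 1, by = 0.5))   # toy design matrix, 3 x 2
lambda <- plogis(X %*% c(-3, 5))
str(lambda)             # num [1:3, 1] -> a one-column matrix
str(as.vector(lambda))  # num [1:3]   -> the vector shape JAGS expects for y[j]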

Multiple regression function does not work in R

I am trying to get the piece of code below to work. It performs multiple regression with different stepwise selection methods (backward, forward, exhaustive, seqrep), but it returns an error at the method = stepwise line.
# cross-validation
predict.regsubsets <- function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefs = coef(object, id = id)
  xvars = names(coefs)
  mat[, xvars] %*% coefs
}

rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted) ^ 2))
}

cv <- function(stepwise, folds) {
  num_folds = folds
  set.seed(1)
  folds = caret::createFolds(mtcars$mpg, k = num_folds)
  fold_error = matrix(0, nrow = num_folds, ncol = num_vars,
                      dimnames = list(paste(1:num_folds), paste(1:num_vars)))
  for (j in 1:num_folds) {
    train_fold    = mtcars[-folds[[j]], ]
    validate_fold = mtcars[ folds[[j]], ]
    best_fit = regsubsets(mpg ~ ., data = mtcars, nvmax = 10, method = stepwise)
    for (i in 1:num_vars) {
      pred = predict(best_fit, validate_fold, id = i)
      fold_error[j, i] = rmse(actual = validate_fold$mpg,
                              predicted = pred)
    }
  }
  cv_error = apply(fold_error, 2, mean)
  par(mfrow = c(1, 1))
  plot(cv_error, type = 'l')
  cv_min = which.min(cv_error)
  points(cv_min, cv_error[cv_min], col = "red", cex = 2, pch = 20)
}

cv("backward", 5)
What have I done wrong?
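A few issues stand out, offered as a sketch since the exact error text isn't shown: num_vars is never defined anywhere (so its first use should fail), regsubsets() requires library(leaps), and the model is fit on the full mtcars rather than train_fold, which defeats the cross-validation. Passing the string "backward" as method is itself fine. A hedged revision of the relevant lines (num_vars = 10 is an assumption chosen to match nvmax):

library(leaps)   # provides regsubsets(); assumed missing from the session
num_vars <- 10   # assumption: one candidate model per size, matching nvmax = 10
# inside the fold loop, fit on the training fold rather than the full data:
best_fit <- regsubsets(mpg ~ ., data = train_fold, nvmax = num_vars, method = stepwise)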

R doParallel: couldn't find function

I have set up the following function:
cv_model <- function(dat, targets, predictors_name){
  library(randomForest)
  library(caret)
  library(MLmetrics)
  library(Metrics)
  # set up error measures
  sumfct <- function(data, lev = NULL, model = NULL){
    mape <- MAPE(y_pred = data$pred, y_true = data$obs)
    RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
    MAE  <- mean(abs(data$obs - data$pred))
    BIAS <- mean(data$obs - data$pred)
    Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
    c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
  }
  for (k in 1:length(dat)) {
    a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
    b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
    ab <- list(a, b)
    for (i in 1:length(targets)) {
      for (j in 1:length(ab)) {
        # specify trainControl
        control <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
                                search = "grid", savePred = T,
                                summaryFunction = sumfct)
        tunegrid <- expand.grid(mtry = c(1:length(predictors_name)))
        set.seed(42)
        model <- train(formula(paste0(targets[i],
                                      " ~ ",
                                      paste(predictors_name, sep = '', collapse = ' + '))),
                       data = ab[[j]],
                       method = "rf",
                       ntree = 25,
                       metric = "RMSE",
                       tuneGrid = tunegrid,
                       trControl = control)
      }
    }
  }
}
According to this tutorial (https://topepo.github.io/caret/parallel-processing.html) I can parallelize my code just by calling library(doParallel); cl <- makePSOCKcluster(2); registerDoParallel(cl).
When I then use the function with doParallel
predictors_name <- c("Time", "Chick")
targets <- "weight"

dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10, 320:350)]  <- "a"
dat$vari[-c(1:10, 320:350)] <- "b"
d <- list(dat[1:300, ], dat[301:500, ])

## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)

cv_model(dat = d, targets = targets, predictors_name = predictors_name)

# end parallel computing
stopCluster(cl)
the error message 'couldn't find function "MAPE"' occurs.
How can I fix this without using the foreach syntax?
If I qualify each call with its package, as in package::function, it works. Maybe there is a more elegant solution, but this is how I got the code running without an error:
cv_model <- function(dat, targets, predictors_name){
  library(randomForest)
  library(caret)
  library(MLmetrics)
  library(Metrics)
  # set up error measures
  sumfct <- function(data, lev = NULL, model = NULL){
    mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
    RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))  # note: mean() has no na.omit argument; na.rm = TRUE is presumably intended
    MAE  <- mean(abs(data$obs - data$pred))
    BIAS <- mean(data$obs - data$pred)
    Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
    c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
  }
  for (k in 1:length(dat)) {
    a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
    b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
    ab <- list(a, b)
    for (i in 1:length(targets)) {
      for (j in 1:length(ab)) {
        # specify trainControl
        control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 10,
                                       search = "grid", savePred = T,
                                       summaryFunction = sumfct)
        tunegrid <- expand.grid(mtry = c(1:length(predictors_name)))
        set.seed(42)
        model <- caret::train(formula(paste0(targets[i],
                                             " ~ ",
                                             paste(predictors_name, sep = '',
                                                   collapse = ' + '))),
                              data = ab[[j]],
                              method = "rf",
                              ntree = 25,
                              metric = "RMSE",
                              tuneGrid = tunegrid,
                              trControl = control)
      }
    }
  }
}
predictors_name <- c("Time", "Chick", "Diet")
targets <- "weight"

dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10, 320:350)]  <- "a"
dat$vari[-c(1:10, 320:350)] <- "b"
d <- list(dat[1:300, ], dat[301:578, ])

## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)

cv_model(dat = d, targets = targets, predictors_name = predictors_name)

# end parallel computing
stopCluster(cl)
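An alternative that keeps the unqualified calls, assuming the root cause is simply that freshly started PSOCK workers have no packages attached: load the packages on every worker before calling the function. A minimal sketch:

library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
# attach the needed packages on each worker so unqualified calls
# such as MAPE() also resolve there
parallel::clusterEvalQ(cl, {
  library(MLmetrics)
  library(caret)
  library(randomForest)
})
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
stopCluster(cl)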

Custom classification threshold for GBM

I'm trying to create a custom GBM model that tunes the classification threshold for a binary classification problem. There is a nice example provided on the caret website here, but when I try to apply something similar to GBM I receive the following error:
Error in { : task 1 failed - "argument 1 is not a vector"
Unfortunately, I have no idea where the error comes from, and the message isn't very helpful.
Here's an example with the code I've used to define the custom GBM:
library(caret)
library(gbm)
library(pROC)

#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####

## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")

## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
                                                 "n.minobsinnode", "threshold"),
                                   class = rep("numeric", 5),
                                   label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
                                             "Min. Terminal Node Size", "Probability Cutoff"))

## Customise the tuning grid:
## Some parameters are fixed. Will give a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    grid <- expand.grid(n.trees = seq(50, 250, 50),
                        interaction.depth = 2,   ### fix interaction depth at 2
                        shrinkage = 0.0001,      ### fix learning rate at 0.0001
                        n.minobsinnode = seq(2, 10, 2),
                        threshold = seq(.01, .99, length = len))
  } else {
    grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
                        interaction.depth = sample(1:10, replace = TRUE, size = len),
                        shrinkage = runif(len, min = .001, max = .6),
                        n.minobsinnode = sample(5:25, replace = TRUE, size = len),
                        threshold = runif(1, 0, size = len))
    grid <- grid[!duplicated(grid), ]  ### remove any duplicated rows in the training grid
  }
  grid
}

## Here we fit a single gbm model and loop over the threshold values to get predictions from the
## same gbm model.
customGBM$loop = function(grid) {
  library(plyr)
  loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for (i in seq(along = loop$threshold)) {
    index <- which(grid$n.trees == loop$n.trees[i] &
                   grid$interaction.depth == loop$interaction.depth[i] &
                   grid$shrinkage == loop$shrinkage[i] &
                   grid$n.minobsinnode == loop$n.minobsinnode[i])
    cuts <- grid[index, "threshold"]
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
  }
  list(loop = loop, submodels = submodels)
}

## Fit the model independent of the threshold parameter
customGBM$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  if (any(names(theDots) == "distribution")) {
    modDist <- theDots$distribution
    theDots$distribution <- NULL
  } else {
    if (is.numeric(y)) {
      stop("This works only for 2-class classification problems")
    } else modDist <- if (length(lev) == 2) "bernoulli" else
      stop("This works only for 2-class classification problems")
  }
  # if (length(levels(y)) != 2)
  #   stop("This works only for 2-class problems")
  ## check to see if weights were passed in (and available)
  if (!is.null(wts)) theDots$w <- wts
  if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
  modArgs <- list(x = x,
                  y = y,
                  interaction.depth = param$interaction.depth,
                  n.trees = param$n.trees,
                  shrinkage = param$shrinkage,
                  n.minobsinnode = param$n.minobsinnode,
                  distribution = modDist)
  do.call("gbm.fit", modArgs)
}

## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
                 type = "response") #[, modelFit$obsLevels[1]]
  out[is.nan(out)] <- NA
  class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
                       modelFit$obsLevels[1],
                       modelFit$obsLevels[2])
  ## Raising the threshold for class #1 means a higher level of
  ## evidence is needed to call it class 1, so it should
  ## decrease sensitivity and increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1],
                modelFit$obsLevels[2])
  if (!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold))
    out[[1]] <- tmp2
    for (i in seq(along = submodels$threshold)) {
      out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                             modelFit$obsLevels[1],
                             modelFit$obsLevels[2])
    }
  }
  out
}

## The probabilities are always the same but we have to create
## multiple versions of the probs to evaluate the data across
## thresholds
customGBM$prob = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- cbind(out, 1 - out)
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
    tmp <- as.list(as.data.frame(tmp))
    lapply(tmp, function(x, lvl) {
      x <- cbind(x, 1 - x)
      colnames(x) <- lvl
      x
    }, lvl = modelFit$obsLevels)
    out <- c(list(out), tmp)
  }
  out
}

fourStats <- function(data, lev = levels(data$obs), model = NULL) {
  ## This code will get the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  ## The best possible model has sensitivity of 1 and specificity of 1.
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                   ncol = 2,
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}
And here is some code showing how to use the custom model:
set.seed(949)
trainingSet <- twoClassSim(500, -9)
mod1 <- train(Class ~ ., data = trainingSet,
              method = customGBM, metric = "Dist",
              maximize = FALSE, tuneLength = 10,
              trControl = trainControl(method = "cv", number = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))
The model appears to run but finishes with the error above. If someone could help me customise the GBM model to tune both the GBM parameters and the probability threshold for the classes, that would be great.
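Without running it I can't pinpoint the failure, but one mismatch looks suspect and is offered as an assumption: $loop builds submodels data frames containing only a threshold column, yet $prob calls predict(..., n.trees = submodels$n.trees), and submodels$n.trees is NULL there. Since the threshold never changes the probabilities, replicating the single probability data frame once per submodel row may be what was intended; a sketch of that branch:

# Hypothetical repair of the submodels branch in customGBM$prob:
# the class probabilities are identical for every threshold, so repeat them.
customGBM$prob <- function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- data.frame(out, 1 - out)
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    # one copy of the same probabilities per extra threshold
    out <- c(list(out), rep(list(out), nrow(submodels)))
  }
  out
}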
