loop over columns for regression using purrr and caret - r

I am trying to loop over regressions using purrr and caret, but I have trouble passing arguments.
# sample dataframe
foo <- data.frame(y1 = runif(10),
y2 = runif(10),
y3 = runif(10),
x1 = runif(10),
x2 = runif(10),
x3 = runif(10)
)
# list of dependent and independent variables
Yvars <- c("y1","y2","y3")
Xvars <- c("x1","x2","x3")
# library(caret)
# custom caret function to loop over vars
caretlm <- function(xvars, yvars, data) {
set.seed(1123)
lmFitTest <- train(x = eval(substitute(xvars)), y = eval(substitute(yvars)), data = data,
method = "lm",
trControl = trainControl(method = "cv")
)
}
# library(purrr)
modellist_lm <- map2(xvars, yvars, ~caretlm(.x, .y, foo) )
# Error in eval(substitute(xvars)) : object '.x' not found
When I do not use eval and substitute, I get another error:
caretlm2 <- function(xvars, yvars, data) {
set.seed(1123)
lmFitTest <- train(x = xvars, y = yvars, data = data,
method = "lm",
trControl = trainControl(method = "cv")
)
}
modellist_lm <- map2(xvars, yvars, ~caretlm2(.x, .y, foo) )
# Error: Please use column names for `x`
Please suggest if there are better methods or frameworks..

Not sure about the x, y method, but that function has a formula method which appears to me to be easier to work with (note that I changed Data to data)
# Fit a cross-validated linear model with caret for one predictor/response pair.
#
# xvars: character vector of predictor column names (right-hand side terms).
# yvars: name of the response column (a single string).
# data:  data frame containing the columns named in xvars and yvars.
#
# Returns the object produced by train().
caretlm <- function(xvars, yvars, data) {
  # Fixed seed so every model is resampled with the same random stream.
  set.seed(1123)
  # reformulate() builds the formula `yvars ~ xvars` from the strings, so no
  # eval()/substitute() is needed. The `data` argument is used here; passing
  # the global `foo` instead would silently ignore the caller's data frame.
  train(reformulate(xvars, response = yvars),
    data = data,
    method = "lm",
    trControl = trainControl(method = "cv")
  )
}
modellist_lm <- map2(Xvars, Yvars, ~caretlm(.x, .y, foo))

Related

R caret: train() failed for repeatedcv with factor predictors

The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine.
The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  # Fit one partykit conditional inference forest for caret.
  # x, y:  predictors and outcome handed over by train()
  # wts:   case weights, or NULL when none were supplied
  # param: tuning values; only `mtry` is read
  # ...:   extra arguments forwarded to partykit::cforest
  # lev, last and classProbs are accepted but not used in this body.
  # make consistent factor levels
  if (any(sapply(x, is.factor))) {
    fac_col_names <- names(grep("factor", sapply(x, class), value = TRUE))
    # assign present levels to each subset
    for (fac_col in fac_col_names) {
      keep <- which(names(x) == fac_col)
      x[, keep] <- factor(x[, keep], levels = as.character(unique(x[, keep])))
    }
  }
  dat <- if (is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
  dat$.outcome <- y
  theDots <- list(...)
  if (any(names(theDots) == "mtry")) {
    # The tuning value replaces a user-supplied mtry. It has to be stored in
    # the local `mtry` (read below when the arguments are assembled), and the
    # copy in `...` is dropped so cforest does not receive `mtry` twice.
    mtry <- as.integer(param$mtry)
    theDots$mtry <- NULL
  } else {
    mtry <- min(param$mtry, ncol(x))
  }
  ## pass in any model weights
  if (!is.null(wts)) theDots$weights <- wts
  modelArgs <- c(
    list(
      formula = as.formula(.outcome ~ .),
      data = dat,
      mtry = mtry
    ),
    theDots
  )
  out <- do.call(partykit::cforest, modelArgs)
  out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit#responses#levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
  # Predictions reshaped into a data frame with one column per outcome level.
  if (!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
  obsLevels <- levels(modelFit$'(response)')
  # `::` only resolves exported objects; the cforest predict method is reached
  # with `:::`, the same way the `predict` and `oob` elements call it.
  # NOTE(review): no `type` argument is passed, so this relies on the method's
  # default prediction type -- confirm whether type = "prob" is required here.
  rawProbs <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE)
  probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
  out <- data.frame(probMatrix)
  colnames(out) <- obsLevels
  rownames(out) <- NULL
  out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
# Outcome levels read from the response stored on the fitted object.
# The slot accessor is `@`; written with `#` the rest of the line becomes a
# comment and the list definition no longer parses.
# NOTE(review): `x@data@get("response")` is slot-style access -- confirm that
# objects returned by partykit::cforest actually expose these slots.
levels = function(x) levels(x@data@get("response")[,1]),
# Order the tuning grid by its first column.
sort = function(x) x[order(x[,1]),],
# Out-of-bag performance: OOB predictions compared with the stored response.
oob = function(x) {
  obs <- x@data@get("response")[,1]
  pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
  postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model ("-1" removes the intercept column)
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable, taken from the same data frame (`dat`) the design
# matrix was built from; `data` on its own is a function, not the data frame
model_train.design.data <- cbind(y_column = dat$y_column, model_train.design.matrix)

How to have output from lm() include std. error and others without using summary() for stargazer

I'm fitting several linear models in r in the following way:
# Simulate two predictors and a response that depends on x1.
set.seed(12345)
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n) + 0.1
# The response is built from x1; no object named `x` is defined in this script.
y <- x1 + rnorm(n)
df <- data.frame(x1, x2, y)
# Right-hand sides of the models to fit, one string per model.
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
summary(fit) #fit
}
# Collect a list of model summaries into parallel lists.
#
# vv: list whose elements each carry a `coefficients` matrix with estimates in
#     column 1 and standard errors in column 2 (as the summaries built by
#     regf_lm do).
#
# Returns list(fits, coefs, ses): the elements themselves, their first
# coefficient column, and their second coefficient column, in input order.
gbind_lm <- function(vv) {
  # Positional, unnamed copies -- the same result the index loop produced.
  fits <- unname(as.list(vv))
  # lapply() handles an empty input, where `1:n` would iterate over c(1, 0)
  # and fail on a missing element.
  coefs <- lapply(fits, function(fit) fit$coefficients[, 1])
  ses <- lapply(fits, function(fit) fit$coefficients[, 2])
  list("fits" = fits, "coefs" = coefs, "ses" = ses)
}
# Write a stargazer table for the models collected by gbind_lm().
#
# mylist:    list with elements `fits`, `coefs` and `ses`
# fname:     output file name, placed inside ~/projects/outputs
# title_str: table title
# m_type:    stargazer output type (default "html")
# ...:       further arguments forwarded to stargazer()
stargazer_lm <- function(mylist, fname, title_str, m_type = "html", ...) {
  stargazer(mylist$fits,
    coef = mylist$coefs,
    se = mylist$ses,
    type = m_type,
    title = title_str,
    # file.path() inserts the separator; paste0() glued the file name straight
    # onto the directory name ("~/projects/outputsname.html").
    out = file.path("~/projects/outputs", fname),
    single.row = TRUE,
    ...
  )
}
p_2 <- map(x_str,
~ regf_lm (df = df ,
y_var = "y", x_str = .))
m_all <- do.call(c, list(p_2)) %>% gbind_lm()
stargazer_lm(m_all,"name.html","My model", m_type = "html")
In regf_lm, if I use summary(fit) on the last line, I'm able to generate reg output with columns for estimated coefficients, std. error, etc. But Stargazer() does not work with summary(lm()) (returns error $ operator is invalid for atomic vectors). However, if I just use "fit" on the last line in regf_lm, the output shows only the estimated coefficients and not std error, R sq...and gbind_lm() won't work because I cannot extract ses or fit.
Any advice is greatly appreciated.
You can directly export model statistics in tidy format with the package broom
library(broom)
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x1 + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
return(list(fit,select(broom::tidy(fit),std.error))) #fit
}
exm_model <- regf_lm(iris,'Sepal.Width','Sepal.Length')
stargazer(exm_model[[1]], coef = exm_model[[2]], title = 'x_model',
out ='abc', single.row = T)
This piece of code worked on my local with no problem, I think you can apply this in your workflow.

how to fix Error " Error in `[.data.frame`(data, , all.vars(Terms), drop = FALSE) : undefined columns selected" in caret package

i encounter this error :
Error in [.data.frame(data, , all.vars(Terms), drop = FALSE) :
undefined columns selected
when i use caret to do regression with using bootstrap for 3 different cluster(index column)
library("tidyverse")
library("lattice")
library("caret")
library("janitor")
data<- read.csv("C:/Users/asus/Desktop/test.csv",header = TRUE)
mydata <- data.frame(index = data$cluster,
x = data[,3:4],
y = data[,5])
tab <- table(mydata$index)
tab
sample_n(mydata, 3)
attach(mydata)
mylist <- list()
mydata <- clean_names(mydata)
head(mydata)
for (i in 1:length(unique(mydata$index))) {
# define training control
train.ctrl <- trainControl(method = "boot", number = tab[i])
# train the model
mylist[[i]] <- train(mydata[index == i,"y"] ~ mydata[index ==i,"x_xa"] + mydata[index == i,"x_xb"], data = data.frame(mydata), method = "lm",
trControl = train.ctrl)
print(mylist[[i]])
summary(mylist[[i]])
}
here you can see my data:
You can subset the data within each iteration and apply the same formula, since each dataset would have the same columns. Try to go through the help page again. Also, please include only the relevant code.
Let's say your data is like this:
set.seed(111)
mydata = data.frame(index = sample(1:3,500,replace=TRUE),
x1 = rnorm(500),
x2 = rnorm(500),
y = runif(500)
)
Then something like this:
library(caret)
# Rows per cluster, keyed by cluster label.
tab <- table(mydata$index)
# One slot per cluster, in the table's label order and named by label.
mylist <- vector("list", length(tab))
names(mylist) <- names(tab)
for (i in names(tab)) {
  # Look the count up by label. Positional tab[i] only lines up with cluster i
  # when the labels happen to be exactly 1, 2, ..., k.
  train.ctrl <- trainControl(method = "boot", number = tab[[i]])
  # Same formula for every cluster; only the rows differ.
  mylist[[i]] <- train(y ~ x1 + x2,
    data = subset(mydata, index == i),
    method = "lm",
    trControl = train.ctrl
  )
}

step/stepAIC on glms constructed inside function calls

I'm having a problem linked to visibility/environment. In short, glms constructed inside functions can't be simplifed using step/stepAIC:
foo = function(model) {
m = glm(y~x, family=model$family, data = dframe)
return(m)
}
y = rbinom(100, 1, 0.5)
x = y*rnorm(100) + rnorm(100)
dframe = data.frame(y, x)
m = glm(y~x, family='binomial', data = dframe)
m2 = foo(m)
library(MASS)
summary(m2)
print(m2$family)
m3 = stepAIC(m2, k = 2)
This results in the following error:
Error in glm(formula = y ~ 1, family = model$family, data = dframe) :
object 'model' not found
This despite m2 looking like it fit well and the family is defined. Sorry if the example is a little contrived.
Found the solution - the original glm needs to be constructed with do.call.
# Refit `y ~ x` with the data and family taken from an existing glm object.
# The model is built through do.call() so the data and family values are
# placed in the call itself; a direct glm(..., family = model$family) call
# keeps the expression `model$family`, which stepAIC later fails to find.
foo <- function(model) {
  glm_args <- list(
    as.formula(y ~ x),
    data = model$data,
    family = model$family
  )
  do.call("glm", glm_args)
}
y = rbinom(100, 1, 0.5)
x = y*rnorm(100) + rnorm(100)
dframe = data.frame(y, x)
m = glm(y~x, family='binomial', data = dframe)
m2 = foo(m)
library(MASS)
summary(m2)
print(m2$family)
m3 = stepAIC(m2, k = 2)

Caret package Custom metric

I'm using the caret function "train()" in one of my project and I'd like to add
a "custom metric" F1-score. I looked at this url caret package
But I cannot understand how I can build this score with the parameter available.
There is an example of custom metric which is the following:
## Example with a custom metric
# Summary function for trainControl(): median absolute deviation of the
# prediction errors, returned as a one-element vector named "MAD".
# data: data frame with columns `obs` and `pred`; lev and model are unused.
madSummary <- function(data, lev = NULL, model = NULL) {
  errors <- data$obs - data$pred
  c(MAD = mad(errors, na.rm = TRUE))
}
robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(degree = 1, nprune = (1:10) * 2)
earthFit <- train(medv ~ .,
data = BostonHousing,
method = "earth",
tuneGrid = marsGrid,
metric = "MAD",
maximize = FALSE,
trControl = robustControl)
Update:
I tried your code but the problem is that it doesn't work with multiple classes like with the code below (The F1 score is displayed, but it is weird) I'm not sure but I think the function F1_score works only on binary classes
library(caret)
library(MLmetrics)
set.seed(346)
dat <- iris
## See http://topepo.github.io/caret/training.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
print(data)
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
c(F1 = f1_val)
}
# Split the data: createDataPartition() is asked for p = .70, so 70% of the
# rows go to the training set and the remaining rows form the test set
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train,]
testClass <- dat[-in_train,]
set.seed(35)
mod <- train(Species ~ ., data = trainClass ,
method = "rpart",
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
print(mod)
I also coded a manual F1 score that takes the confusion matrix as its single data input (I'm not sure whether a confusion matrix is available inside "summaryFunction"):
F1_score <- function(mat, algoName){
  ##
  ## Compute the average (macro) F1-score from a confusion matrix
  ##
  # mat:      square confusion matrix
  #           Remark: left column = prediction // top = real values
  # algoName: label for the model, used as the row name of the per-class scores
  # Returns the mean of the per-class F1 scores.
  true_pos <- diag(mat)
  # Rows hold the predictions, so row totals give precision and column totals
  # give recall (the two names were swapped before; F1 is symmetric in them,
  # so the returned value is unchanged).
  precision <- true_pos / rowSums(mat)
  recall <- true_pos / colSums(mat)
  # Per-class scores as a 1 x k matrix labelled with the class names.
  f1_by_class <- matrix(
    2 * (precision * recall) / (precision + recall),
    nrow = 1,
    dimnames = list(algoName, colnames(mat))
  )
  # A bare `F1_score` statement inside a function displays nothing, so only
  # the average is produced, as before.
  mean(f1_by_class[1, ])
}
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)
set.seed(346)
dat <- twoClassSim(200)
## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
c(F1 = f1_val)
}
set.seed(35)
mod <- train(Class ~ ., data = dat,
method = "rpart",
tuneLength = 5,
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
For the two-class case, you can try the following:
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = prSummary,
classProbs = TRUE))
Or define a custom summary function that combines both twoClassSummary and prSummary (my current favorite), which provides the following possible evaluation metrics: AUROC, Spec, Sens, AUPRC, Precision, Recall, F. Any of these can be used as the metric argument. This also handles the special case I mentioned in my comment on the accepted answer (F is NA).
# Summary function for trainControl() that concatenates the results of
# twoClassSummary() and prSummary() and renames "ROC" to "AUROC" and "AUC" to
# "AUPRC".
# data, lev, model: passed through unchanged to both summary functions.
comboSummary <- function(data, lev = NULL, model = NULL) {
  out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
  # special case missing value for F
  # `out` is the c() of two summary vectors, not a list, so it is indexed with
  # [[ ]]; `$` is invalid on an atomic vector and raises an error.
  if (is.na(out[["F"]])) {
    out[["F"]] <- 0
  }
  names(out) <- gsub("AUC", "AUPRC", names(out))
  names(out) <- gsub("ROC", "AUROC", names(out))
  out
}
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = comboSummary,
classProbs = TRUE))

Resources