Related
I'm trying to make an R script to allow users to input a dataset and then display the predictive rate parity graph for their corresponding dataset. I have most of the code but when I attempt to test it with a dataset, I receive an error.
The code is below:
library(mltools)
library(fairness)
library(dplyr)
library(data.table)
calculate_fairness_metric <- function(newdata, target, sensitive_attr, base) {
set.seed(77)
val_percent <- 0.2
val_idx <- sample(1:nrow(new_data))[1:round(nrow(new_data) * val_percent)]
df_train <- new_data[-val_idx, ]
df_valid <- new_data[ val_idx, ]
model1 <- glm(target ~ .,
data = df_train,
family = binomial(link = 'logit'))
df_valid$prob_1 <- predict(model1, df_valid, type = 'response')
res1 <- pred_rate_parity(data = df_valid,
outcome = target,
outcome_base = '0',
group = sensitive_attr,
probs = 'prob_1',
cutoff = 0.5,
base = base)
return(res1$Metric)
}
calculate_fairness_metric(revised, "readmitted", "race", "Caucasian")
Error in model.frame.default(formula = target ~ ., data = df_train, drop.unused.levels = TRUE) :
variable lengths differ (found for 'race')
The dataset I used is below:
dataset image
The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine.
The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
# make consistent factor levels
if(any(sapply(x, is.factor))){
fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])],
levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry, ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),
data = dat,
mtry = mtry), # change controls to mtry?
theDots)
out <- do.call(partykit::cforest, modelArgs)
out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit#responses#levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type="prob) ? id?
probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
levels = function(x) levels(x#data#get("response")[,1]),
sort = function(x) x[order(x[,1]),],
oob = function(x) {
obs <- x#data#get("response")[,1]
pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable
model_train.design.data <- cbind(y_column = data$y_column, model_train.design.matrix)
I'm trying to create a loop (performed 10 times) to get repeated cross-validation to evaluate the predictive performance of 4 models, and then I have to compute an average of my performances. I must admit that I am new to R and I am struggling with this simple task. I started by creating my formula to cross-validate my models.
loss.mse <- function(fit, df, y, transf){
y_pred <- transf(predict(fit, df))
out <- (y - y_pred)^2
return(mean(out))
}
loss.mae <- function(fit, df, y, transf){
y_pred <- transf(predict(fit, df))
out <- abs(y - y_pred)
return(mean(out))
}
validate.cv <- function(data, folds, model_fn, y_var,
transf = identity, seed)
{
set.seed(seed)
fold_id <- sample(rep(1:folds, length.out = nrow(data)))
out.mse <- out.mae <- numeric(folds)
for(test in 1:folds){
data_test <- subset(data, fold_id == test)
data_train <- subset(data, fold_id != test)
fit <- model_fn(data_train)
y_test <- y_var[fold_id == test]
out.mse[test] <- loss.mse(fit, data_test, y_test, transf)
out.mae[test] <- loss.mae(fit, data_test, y_test, transf)
}
return(list(MAE = mean(out.mae), MSE = out.mse,
RMSE = sqrt(mean(out.mse))))
Then I named my models and I cross-validated them but I can't figure out how to get my 10 times loop !
model_lm <- function(data) lm(StockPrice ~., data)
model_step <- function(data) step(lm(StockPrice ~., data), trace = 0)
model_rpart <- function(data)
{
set.seed(1234)
mod.rpart <- rpart(StockPrice ~ ., data, cp = 0.0001, model = TRUE)
cp.select <- function(big.tree) {
min.x <- which.min(big.tree$cptable[, 4])
for(i in 1:nrow(big.tree$cptable)) {
if(big.tree$cptable[i, 4] <
(big.tree$cptable[min.x, 4] + big.tree$cptable[min.x, 5]))
return(big.tree$cptable[i, 1])
}
}
mod.rpart.prune <- prune(mod.rpart, cp = cp.select(mod.rpart))
return(mod.rpart.prune)
}
model_step_gam <- function(data)
{
mod <- model_step(data)
predictors <- all.vars(terms(mod))[-1]
f <- as.formula(
paste("StockPrice",
paste(paste("s(",predictors, ")"), collapse = " + "),
sep = " ~ "))
mod_gam <- gam(f, data = data)
seed<-1234
m.log.full <-validate.cv(log.Finance, 10, model_lm, Finance$StockPrice,exp, seed)
m.log.step <-validate.cv(log.Finance, 10, model_step, Finance$StockPrice,exp, seed)
m.log.rpart <-validate.cv(log.Finance, 10, model_rpart, Finance$StockPrice,exp, seed)
m.log.gam <-validate.cv(log.Finance, 10, model_step_gam, Finance$StockPrice,exp, seed)
mat.test <-data.frame(Model =c("Full (log)","Step (log)","CART (log)", "Step GAM (log)"),
RMSE =c(m.log.full$RMSE, m.log.step$RMSE,m.log.rpart$RMSE, m.log.gam$RMSE),
MAE =c(m.log.full$MAE, m.log.step$MAE, m.log.rpart$MAE,m.log.gam$MAE))
print(mat.test)
If you have any ideas I would gladly try them. Thank you in advance for your help :)
I am currently trying to solve a bug but believe the data I am working with may be too complex and cause errors that shouldn't normally occur. I've written a function, and was hoping to add a try or tryCatch statement to skip the error if it occurs. I currently have:
library(glmnet)
foo <- function(data, ols_ps = TRUE, index) {
# index is the bootstrap sample index
x <- data[index, -1]
y <- data[index, 1]
ridge <- cv.glmnet(x, y, alpha = 0)
## The intercept estimate should be dropped.
weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
# alpha=1, lasso
alasso <- cv.glmnet(x, y, alpha = 1,
penalty.factor = 1 / abs(weights))
# Select nonzero coefficients
coef <- as.vector(coef(alasso, s = alasso$lambda.min,
exact = TRUE, x = x, y = y,
penalty.factor = 1 / abs(weights)))[-1]
if (ols_ps == TRUE) {
coef_nonzero <- coef != 0
new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
error=function(e) NA)
if (!any(is.na(new_x)) & ncol(new_x) > 0) {
ls.obj <- lm(y ~ new_x)
ls_coef <- (ls.obj$coefficients)[-1]
coef[coef_nonzero] <- ls_coef
} else {
coef <- coef
}
} else {
coef <- coef
}
return(coef)
}
which normally works and works on most datasets. I think the error may be coming from a complex dataset. Is it possible to skip OLS if I get the below error?
"Error in x[, coef_nonzero, drop = FALSE] : \n (subscript) logical subscript too long\n"
attr(,"class")
Here is a minimal working example per request.
set.seed(123)
matrix <- matrix(runif(1000), ncol=10)
boot(matrix,foo,R=50)
Thanks in advance.
Maybe like this?
foo <- function(data, index) {
# index is the bootstrap sample index
x <- data[index, -1]
y <- data[index, 1]
ridge <- cv.glmnet(x, y, alpha = 0)
## The intercept estimate should be dropped.
weights <- as.numeric(coef(ridge, s = ridge$lambda.min))[-1]
# alpha=1, lasso
alasso <- cv.glmnet(x, y, alpha = 1,
penalty.factor = 1 / abs(weights))
# Select nonzero coefficients
coef <- as.vector(coef(alasso, s = alasso$lambda.min,
exact = TRUE, x = x, y = y,
penalty.factor = 1 / abs(weights)))[-1]
coef_nonzero <- coef != 0
new_x <- tryCatch(x[, coef_nonzero, drop = FALSE],
error=function(e) NA)
if (!any(is.na(new_x))) {
ls.obj <- lm(y ~ new_x)
ls_coef <- (ls.obj$coefficients)[-1]
coef[coef_nonzero] <- ls_coef
}
return(coef)
}
The problem is that we have no case when it fails so far.
Based on this topic, I have created a function that returns a dataset with variables related to the outcome (y) by specific linear coefs.
simulate_data_regression <- function(sample=10, coefs=0, error=0){
n_var <- length(coefs)
X <- matrix(0, ncol=n_var, nrow=sample)
beta <- as.matrix(coefs)
for (i in 1:n_var){
X[,i] <- scale(rnorm(sample, 0, 1))
}
y <- X %*% beta
if(error != 0){
y <- y + rnorm(sample, 0, error)
}
data = data.frame(X=X)
names(data) <- paste0("V", 1:n_var)
data$y <- as.vector(y)
return(data)
}
data <- simulate_data_regression(sample=50, coefs=c(0.1, 0.8), error=0)
summary(data)
sd(data$V1)
sd(data$y)
It works great. However, I would need to have a standardized y (mean 0 and SD 1). But when I try to scale it, the coefficients change:
data <- simulate_data_regression(sample=50, coefs=c(0.1, 0.8), error=0)
data$y <- as.vector(scale(data$y))
coef(lm(y ~ ., data=data))
It is possible to do such thing? Thank you very much!
Edit
In other words, I would like the coefs that are specified to be standardized coefs (expressed in outcome's SD).
Scaling y a posteriori changes the coefs by 1/sd(y). However, I can't think of any way to change the betas before generating y, so that the betas return to their specified value after the scaling of y.
Edit 2: Failed attempt
I've tried running the function twice, first extracting sd(y) and scaling the coefficients with it, in the hope that those scaled coefficients will change to the specified ones once I'll scale y. But it doens't work, which is expected, as sd(y) changes when I change the coefs :'(
Here's the failed attempt:
simulate_data_regression <- function(sample=10, coefs=0, error=0, standardized=TRUE){
stuff <- .simulate_data_regression(sample=sample, coefs=coefs, error=error)
if(standardized == TRUE){
y_sd <- sd(data$y)
data <- .simulate_data_regression(sample=sample, coefs=y_sd*coefs, error=error, X=stuff$X)$data
data$y <- as.vector(scale(data$y))
} else{
data <- stuff$data
}
return(data)
}
.simulate_data_regression <- function(sample=10, coefs=0, error=0, X=NULL, y=NULL){
n_var <- length(coefs)
if(is.null(X)){
X <- matrix(0, ncol=n_var, nrow=sample)
for (i in 1:n_var){
X[,i] <- scale(rnorm(sample, 0, 1))
}
}
beta <- as.matrix(coefs)
y <- X %*% beta
if(error != 0){
y <- y + rnorm(sample, 0, error)
}
data = data.frame(X=X)
names(data) <- paste0("V", 1:n_var)
data$y <- as.vector(y)
return(list(X=X, y=y, data=data))
}
If you scale y the inference will be the same, only the p-values of the intercepts change, not the p-values of the coefficients.
In this example I have set error = 1.
set.seed(1234) # Make the results reproducible
data <- simulate_data_regression(sample = 50, coefs = c(0.1, 0.8), error = 1)
data2 <- data
data2$y <- scale(data2$y)
fit <- lm(y ~ ., data)
fit2 <- lm(y ~ ., data2)
summary(fit)
summary(fit2)
As you can see the p-values of the coefficients are exactly the same though the coefficients themselves are different. You would expect that since you are scaling by the standard errors of the regressors and therefore the coefficients will be scaled by the inverses of those standard errors.
The version of your function below has an argument, which, that allows to specify which regressors to scale. Its default is all of them.
simulate_data_regression2 <- function(sample = 10, coefs = 0, error = 0, which = seq_along(coefs)){
n_var <- length(coefs)
X <- matrix(0, ncol=n_var, nrow=sample)
beta <- as.matrix(coefs)
for (i in 1:n_var){
X[,i] <- rnorm(sample, 0, 1)
if(i %in% which) X[, i] <- scale(X[, i])
}
y <- X %*% beta
if(error != 0){
y <- y + rnorm(sample, 0, error)
}
data = data.frame(X=X)
names(data) <- paste0("V", 1:n_var)
data$y <- as.vector(y)
data
}
Now test the function.
set.seed(1234) # Make the results reproducible
data <- simulate_data_regression2(sample=50, coefs=c(0.1, 0.8), error=1)
set.seed(1234) # Reproduce the data generation process
data2 <- simulate_data_regression2(sample=50, coefs=c(0.1, 0.8), error=1, which = 2)
fit <- lm(y ~ ., data)
fit2 <- lm(y ~ ., data2)
As you can see the coefficients of V2 are equal.
coef(fit)
#(Intercept) V1 V2
# 0.01997809 0.19851020 0.96310013
coef(fit2)
#(Intercept) V1 V2
# 0.07040538 0.21130549 0.96310013
The p-values of the estimates of the coefficients V2 are also equal
summary(fit)
summary(fit2)