I'm new to R and ML but have a focused question that I am trying to answer.
I'm using my own data but following Matt Dancho's example here to predict attrition: http://www.business-science.io/business/2017/09/18/hr_employee_attrition.html
I have removed zero variance and scaled variables as per his update.
My issue is with the explain() step on the explainer object. I get variations of the two errors below: the first from the original code, the second from a slight variation of it. Everything else runs fine up to that point.
explanation <- lime::explain(
as.data.frame(test_h2o[1:10,-1]),
explainer = explainer,
n_labels = 1,
n_features = 4,
kernel_width = 0.5)
gives:
Error during wrapup: arguments imply differing number of rows: 50000, 0
While
explanation <- lime::explain(
as.data.frame(test_h2o[1:500,-1]),
explainer = explainer,
n_labels = 1,
n_features = 5,
kernel_width = 1)
Gives:
ERROR: Unexpected HTTP Status code: 500 Server Error (url = http://localhost:54321/3/PostFile?destination_frame=C%3A%2FUsers%2Fsim.s%2FAppData%2FLocal%2FTemp%2FRtmpykNkl1%2Ffileb203a8d4a58.csv_sid_afd3_26)
Error: lexical error: invalid char in json text.
<html> <head> <meta http-equiv=
(right here) ------^
Please let me know if you have any ideas or insights for this problem, or need additional info from me.
Try this and let me know what you get. Note that this assumes your excel file is stored in a folder called "data" in your working directory. Use getwd() and setwd() to get/set the working directory (or use Projects in RStudio IDE).
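For example (the path below is just a placeholder for wherever your project lives):
getwd()                              # print the current working directory
setwd("C:/path/to/your/project")     # hypothetical path containing the "data" folder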
library(h2o) # Professional grade ML pkg
library(tidyquant) # Loads tidyverse and several other pkgs
library(readxl) # Super simple excel reader
library(lime) # Explain complex black-box ML models
library(recipes) # Preprocessing for machine learning
hr_data_raw_tbl <- read_excel(path = "data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx")
hr_data_organized_tbl <- hr_data_raw_tbl %>%
mutate_if(is.character, as.factor) %>%
select(Attrition, everything())
recipe_obj <- hr_data_organized_tbl %>%
recipe(formula = Attrition ~ .) %>%
step_rm(EmployeeNumber) %>%
step_zv(all_predictors()) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep(data = hr_data_organized_tbl)
hr_data_bake_tbl <- bake(recipe_obj, newdata = hr_data_organized_tbl)
h2o.init()
hr_data_bake_h2o <- as.h2o(hr_data_bake_tbl)
hr_data_split <- h2o.splitFrame(hr_data_bake_h2o, ratios = c(0.7, 0.15), seed = 1234)
train_h2o <- h2o.assign(hr_data_split[[1]], "train" ) # 70%
valid_h2o <- h2o.assign(hr_data_split[[2]], "valid" ) # 15%
test_h2o <- h2o.assign(hr_data_split[[3]], "test" ) # 15%
y <- "Attrition"
x <- setdiff(names(train_h2o), y)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 15
)
automl_leader <- automl_models_h2o#leader
explainer <- lime::lime(
as.data.frame(train_h2o[,-1]),
model = automl_leader,
bin_continuous = FALSE
)
explanation <- lime::explain(
x = as.data.frame(test_h2o[1:10,-1]),
explainer = explainer,
n_labels = 1,
n_features = 4,
n_permutations = 500,
kernel_width = 1
)
explanation
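To visualise the result, lime's plot_features() can then be called on the explanation object:
plot_features(explanation)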
Related
Looking for a kind soul to help me solve this error in R with my current RF model:
Error: Response is constant across permutations. Please check your model
Here are the files needed to run the code: link
Here is my code:
library("lime")
library("randomForest")
RF <- readRDS("RF_classifier4sRNA.rds") # Load the model
origTrainingData <- read.csv( "training_combined.csv", header = TRUE, sep = ",") # load Orig Training data
origTrainingDataLabels <- read.csv( "training_combined_labels.csv", header = TRUE, sep = "," ) # load orig training data labels
Classification <- origTrainingDataLabels$Class
origTrainingDataWithLabels <- cbind(origTrainingData, Classification)
# instances to explain ----
inputFile <- "FeatureTable.tsv"
testData <- read.table( inputFile, sep = "\t", header = TRUE)
class(testData)
testDataPredictions <- predict(RF, testData, type="prob")
testDataPredictions # inspect the predicted class probabilities
# randomForest
# RF <- readRDS("RF_classifier4sRNA.rds")
# pred <- predict(RF, data, type = "prob")
predict_model.randomForest <- function(x, newdata, type, ...) {
res <- predict(x, newdata = newdata, ...)
switch(
type,
raw = data.frame(Response = res$class, stringsAsFactors = FALSE),
prob = as.data.frame(res["posterior"], check.names = FALSE)
)
}
model_type.randomForest <- function(x, ...) 'classification'
?lime()
lime_explainer <- lime( origTrainingData, # Original training data
RF, # The model to explain
bin_continuous = TRUE, # Should continuous variables be binned
# when making the explanation
n_bins = 5, # The number of bins for continuous variables
# if bin_continuous = TRUE
quantile_bins = FALSE # Should the bins be based on n_bins quantiles
# or spread evenly over the range of the training data
)
lime_explanations <- explain( testData, # Data to explain
lime_explainer, # Explainer to use
n_labels = 7,
n_features = 7,
n_permutations = 10,
feature_select = "none"
)
lime_explanations
To be fair, I'm not the author of the original Random Forest model, which can be found here: github
and the full documentation and all other related files can be found here: https://peerj.com/articles/6304/
I'm just trying to apply lime to that model.
Ultimately, my professor was able to help me out :D
So, here's how the functions should actually be for LIME to work in my particular use case:
predict_model.randomForest <- function(x, newdata, type, ...) {
res <- predict(x, newdata = newdata, ...)
# optional debugging output
print(class(res))
print(dim(res))
print(res)
# the switch() must be the last expression so that its value is returned to lime
switch(
type,
raw = data.frame(Response = ifelse(res[,2] > 0.5, "sRNA", "notSRNA"),
stringsAsFactors = FALSE
),
prob = res
)
}
model_type.randomForest <- function(x, ...) 'classification'
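With these methods defined, the lime()/explain() calls from above can be rerun. A sketch (assuming the objects loaded earlier; n_labels = 1 because the problem is binary, and a larger n_permutations, which may also help with the "Response is constant across permutations" message):
lime_explainer <- lime(origTrainingData,   # original training data
                       RF,                 # the model to explain
                       bin_continuous = TRUE,
                       n_bins = 5)
lime_explanations <- explain(testData,
                             lime_explainer,
                             n_labels = 1,          # binary sRNA / notSRNA problem
                             n_features = 7,
                             n_permutations = 5000) # illustrative, larger than the original 10
plot_features(lime_explanations[lime_explanations$case %in% 1:3, ])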
I have a question about the LIME package. I've built a model and explained it with lime, following this example from thomasp85 (thank you):
devtools::install_github("thomasp85/lime")
setwd("C:/world-happiness-report")
load("data_15_16.RData")
# configure multicore
library(doParallel)
cl <- makeCluster(detectCores())
registerDoParallel(cl)
library(caret)
set.seed(42)
index <- createDataPartition(data_15_16$Happiness.Score.l, p = 0.7, list = FALSE)
train_data <- data_15_16[index, ]
test_data <- data_15_16[-index, ]
set.seed(42)
model_mlp <- caret::train(Happiness.Score.l ~ .,
data = train_data,
method = "mlp",
trControl = trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
verboseIter = FALSE))
library(lime)
explain <- lime(train_data, model_mlp, bin_continuous = TRUE, n_bins = 5, n_permutations = 1000)
pred <- data.frame(sample_id = 1:nrow(test_data),
predict(model_mlp, test_data, type = "prob"),
actual = test_data$Happiness.Score.l)
pred$prediction <- colnames(pred)[3:5][apply(pred[, 3:5], 1, which.max)]
pred$correct <- ifelse(pred$actual == pred$prediction, "correct", "wrong")
library(tidyverse)
pred_cor <- filter(pred, correct == "correct")
pred_wrong <- filter(pred, correct == "wrong")
test_data_cor <- test_data %>%
mutate(sample_id = 1:nrow(test_data)) %>%
filter(sample_id %in% pred_cor$sample_id) %>%
sample_n(size = 3) %>%
remove_rownames() %>%
tibble::column_to_rownames(var = "sample_id") %>%
select(-Happiness.Score.l)
test_data_wrong <- test_data %>%
mutate(sample_id = 1:nrow(test_data)) %>%
filter(sample_id %in% pred_wrong$sample_id) %>%
sample_n(size = 3) %>%
remove_rownames() %>%
tibble::column_to_rownames(var = "sample_id") %>%
select(-Happiness.Score.l)
explanation_cor <- explain(test_data_cor, n_labels = 3, n_features = 5)
explanation_wrong <- explain(test_data_wrong, n_labels = 3, n_features = 5)
plot_features(explanation_wrong, ncol = 3)
The plot produces the following example for a certain case:
What I would like to know: is there a way to get numbers on how the probability would change if (for example) Family were higher? For example:
Family in this case has the value 0.71. If Family increased by 0.1, the probability would increase by 0.5%.
The reason is that I have the same type of graph for my own data, and I would like to provide additional information about the meaning of the effect indicated in the graph.
Any information, including pointers to literature, would be helpful :)
It seems you are asking for counterfactuals. LIME does not really provide this functionality; that is not its goal. You have to create the counterfactuals manually and submit them to LIME in the same call, because LIME is non-deterministic -- it randomly samples the neighbours used to fit the interpretable model -- so you get different results every time you run it.
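That said, a rough manual approach is possible: duplicate the observation, perturb the feature of interest, and pass both rows to the explainer in the same call (and/or set a seed) so the comparison is not confounded by LIME's randomness between runs. A minimal sketch, assuming the test_data_cor object and Family column from the code above; the 0.1 increment is purely illustrative:
case_orig <- test_data_cor[1, , drop = FALSE]
case_cf <- case_orig
case_cf$Family <- case_cf$Family + 0.1   # hypothetical perturbation of Family
both_cases <- rbind(case_orig, case_cf)
explanation_cf <- explain(both_cases, n_labels = 3, n_features = 5)
# compare the predicted probabilities (label_prob) and feature weights of the two cases
explanation_cf[, c("case", "label", "label_prob", "feature", "feature_weight")]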
I am trying to build a knn model using caret with my dataset, where True (real sales), DOW (day of the week), and D1 to D10 (historic sales) are available.
library(caret)
library(reshape2)
library(dplyr)
library(tibble)
library(dummies)
#data
rm = matrix(rnorm(100*10, 10, 5), nrow = 100) %>% as.data.frame()
wide = cbind(
rnorm(100, 100, 1),
weekdays(seq(as.Date('2019/1/1'), by='day', length.out = 100)),
rm
)
colnames(wide) = c('true', 'DOW', paste0('D',1:10))
#split into train and test (a simple 90/10 split, for illustration)
train = wide[1:90, ]
test = wide[91:100, ]
#preprocessing for knn
train.true = train[,1]
dow.tr = dummy(train$DOW, sep='.')
dow.te = dummy(test$DOW, sep='.')
k.train = cbind(train[, -c(2, nearZeroVar(train))], dow.tr)
k.test = cbind(test, dow.te)[,-2]
seq.knn.pre1 = rep(0, nrow(test))
for (i in 1:10){
this.train = k.train[, c((i+1):ncol(k.train))]
this.test = k.test[i, c((i+1):ncol(k.test))]
train.control = trainControl(method='repeatedcv', number=10, repeats = 1)
k = train(train.true~., method='knn', tuneLength = 8,
trControl=train.control, preProcess='scale',
data=data.frame(train.true, this.train))
seq.knn.pre1[i] = predict(k, this.test)
}
seq.knn.pre1 = cbind(true = test[,1], k.pred1 = seq.knn.pre1) %>% data.frame()
However, when I knit the file, it gives me the error:
object 'X.Rachel.Documents.Research.file.Rmd.Friday' not found Calls: <Anonymous> ... predict.train -> model.frame -> model.frame.default -> eval -> eval Execution halted
I am guessing the problem might come from the DOW dummy variables: when my simulated dataset does not include categorical variables, the code knits fine. Is there any way I can fix this?
Any suggestion is highly appreciated!
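One thing that may be worth trying (a sketch, not a verified fix): build the weekday dummies with stats::model.matrix() instead of dummies::dummy(), so that the generated column names do not depend on the calling environment (the error message suggests the .Rmd file path is being used as a name prefix during knitting):
dow.tr = model.matrix(~ DOW - 1, data = train)   # one indicator column per weekday
dow.te = model.matrix(~ DOW - 1, data = test)
colnames(dow.tr) = make.names(colnames(dow.tr))
colnames(dow.te) = make.names(colnames(dow.te))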
I'm building a text classifier of Clinton & Trump tweets (the data can be found on Kaggle).
I'm doing EDA and modelling using the quanteda package:
library(readr)     # for read_csv()
library(dplyr)
library(stringr)
library(lubridate) # for as_date(), hms() and hour()
library(quanteda)
library(lime)
#data prep
tweet_csv <- read_csv("tweets.csv")
tweet_data <- tweet_csv %>%
select(author = handle,
text,
retweet_count,
favorite_count,
source_url,
timestamp = time) %>%
mutate(date = as_date(str_sub(timestamp, 1, 10)),
hour = hour(hms(str_sub(timestamp, 12, 19))),
tweet_num = row_number()) %>%
select(-timestamp)
# creating corpus and dfm
tweet_corpus <- corpus(tweet_data)
edited_dfm <- dfm(tweet_corpus, remove_url = TRUE, remove_punct = TRUE, remove = stopwords("english"))
set.seed(32984)
trainIndex <- sample.int(n = nrow(tweet_csv), size = floor(.8*nrow(tweet_csv)), replace = F)
train_dfm <- edited_dfm[as.vector(trainIndex), ]
train_raw <- tweet_data[as.vector(trainIndex), ]
train_label <- train_raw$author == "realDonaldTrump"
test_dfm <- edited_dfm[-as.vector(trainIndex), ]
test_raw <- tweet_data[-as.vector(trainIndex), ]
test_label <- test_raw$author == "realDonaldTrump"
# making sure train and test sets have the same features
test_dfm <- dfm_select(test_dfm, train_dfm)
# using quanteda's NB model
nb_model <- quanteda::textmodel_nb(train_dfm, train_label)
nb_preds <- predict(nb_model, test_dfm)
# defining textmodel_nb as classification model
class(nb_model)
model_type.textmodel_nb_fitted <- function(x, ...) {
return("classification")
}
# a wrapper-up function for data preprocessing
get_matrix <- function(df){
corpus <- corpus(df)
dfm <- dfm(corpus, remove_url = TRUE, remove_punct = TRUE, remove = stopwords("english"))
}
then I define the explainer - no problems here:
explainer <- lime(train_raw[1:5],
model = nb_model,
preprocess = get_matrix)
But when I run the explainer, even on exactly the same dataset as used in the explainer, I get an error:
explanation <- lime::explain(train_raw[1:5],
explainer,
n_labels = 1,
n_features = 6,
cols = 2,
verbose = 0)
Error in predict.textmodel_nb_fitted(x, newdata = newdata, type = type, :
feature set in newdata different from that in training set
Does it have something to do with quanteda and dfms? I honestly don't see why this should happen. Any help will be great, thanks!
We can trace the error to predict_model, which calls predict.textmodel_nb_fitted (I used only the first 10 rows of train_raw to speed up computation):
traceback()
# 7: stop("feature set in newdata different from that in training set")
# 6: predict.textmodel_nb_fitted(x, newdata = newdata, type = type,
# ...)
# 5: predict(x, newdata = newdata, type = type, ...)
# 4: predict_model.default(explainer$model, case_perm, type = o_type)
# 3: predict_model(explainer$model, case_perm, type = o_type)
# 2: explain.data.frame(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
# 1: lime::explain(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
The problem is that predict.textmodel_nb_fitted expects a dfm, not a data frame. For example, predict(nb_model, test_raw[1:5]) gives you the same "feature set in newdata different from that in training set" error. However, explain takes a data frame as its x argument.
A solution is to write a custom textmodel_nb_fitted method for predict_model that does the necessary object conversions before calling predict.textmodel_nb_fitted:
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
X <- corpus(newdata)
X <- dfm_select(dfm(X), x$data$x)
res <- predict(x, newdata = X, ...)
switch(
type,
raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
prob = as.data.frame(res$posterior.prob, check.names = FALSE)
)
}
This gives us
explanation <- lime::explain(train_raw[1:10, 1:5],
explainer,
n_labels = 1,
n_features = 5,
cols = 2,
verbose = 0)
explanation[1, 1:5]
# model_type case label label_prob model_r2
# 1 classification 1 FALSE 0.9999986 0.001693861
There is an error at the final stage of this analysis. When running the explain() function on an h2o model, I get the following error:
Error: All permutations have no similarity to the original observation. Try setting bin_continuous to TRUE and/or increase kernel_size
I have tried both of the suggestions in the error. If I change bin_continuous to TRUE, the lime() function does not work, and other kernel sizes do not work either.
Any thoughts on how to solve this so that I can get the results with the plot_features() function?
library(readxl)
library(httr)
library(dplyr)
library(h2o)
library(lime)
GET("https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_FnUseC_-HR-Employee-Attrition.xlsx",
write_disk(tf <- tempfile(fileext = ".xls")))
hr_data_raw <- read_xlsx(tf)
hr_data <- hr_data_raw %>%
mutate_if(is.character, as.factor) %>%
select(Attrition, everything())
h2o.init()
h2o.no_progress()
hr_data_h2o <- as.h2o(hr_data)
split_h2o <- h2o.splitFrame(hr_data_h2o, c(0.7, 0.15), seed = 1234 )
train_h2o <- h2o.assign(split_h2o[[1]], "train" ) # 70%
valid_h2o <- h2o.assign(split_h2o[[2]], "valid" ) # 15%
test_h2o <- h2o.assign(split_h2o[[3]], "test" ) # 15%
y <- "Attrition"
x <- setdiff(names(train_h2o), y)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 30)
automl_leader <- automl_models_h2o@leader
explainer <- lime::lime(
as.data.frame(train_h2o[,-1]),
model = automl_leader,
bin_continuous = F)
explanation <- lime::explain(
as.data.frame(test_h2o[1:10, -1]),
explainer = explainer,
n_labels = 1,
n_features = 4)
# Error: All permutations have no similarity to the original observation.
# Try setting bin_continuous to TRUE and/or increase kernel_size
# Cannot Continue
plot_features(explanation)
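For what it's worth, the suggestion in the earlier HR-attrition thread above was to keep bin_continuous = FALSE in lime() but pass an explicit kernel_width and a larger n_permutations to explain(). A sketch of that variation (the specific values are illustrative, taken from that answer):
explanation <- lime::explain(
as.data.frame(test_h2o[1:10, -1]),
explainer = explainer,
n_labels = 1,
n_features = 4,
n_permutations = 500,
kernel_width = 1)
plot_features(explanation)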