There is an error at the final stage of this analysis. When running the explain() function on an h2o model, I get the following error:
Error: All permutations have no similarity to the original observation. Try setting bin_continuous to TRUE and/or increase kernel_size
I have tried both suggestions from the error message. If I change bin_continuous to TRUE, the lime() function does not work, and other kernel sizes do not work either.
Any thoughts on how to solve this, so that I can get the results with the plot_features() function?
library(readxl)
library(httr)
library(dplyr)
library(h2o)
library(lime)
GET("https://community.watsonanalytics.com/wp-content/uploads/2015/03/WA_FnUseC_-HR-Employee-Attrition.xlsx",
write_disk(tf <- tempfile(fileext = ".xls")))
hr_data_raw <- read_xlsx(tf)
hr_data <- hr_data_raw %>%
mutate_if(is.character, as.factor) %>%
select(Attrition, everything())
h2o.init()
h2o.no_progress()
hr_data_h2o <- as.h2o(hr_data)
split_h2o <- h2o.splitFrame(hr_data_h2o, c(0.7, 0.15), seed = 1234 )
train_h2o <- h2o.assign(split_h2o[[1]], "train" ) # 70%
valid_h2o <- h2o.assign(split_h2o[[2]], "valid" ) # 15%
test_h2o <- h2o.assign(split_h2o[[3]], "test" ) # 15%
y <- "Attrition"
x <- setdiff(names(train_h2o), y)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 30)
automl_leader <- automl_models_h2o@leader  # best model found by AutoML ("#" would start a comment)
explainer <- lime::lime(
as.data.frame(train_h2o[,-1]),
model = automl_leader,
bin_continuous = F)
explanation <- lime::explain(
as.data.frame(test_h2o[1:10, -1]),
explainer = explainer,
n_labels = 1,
n_features = 4)
# Error: All permutations have no similarity to the original observation.
# Try setting bin_continuous to TRUE and/or increase kernel_size
# Cannot Continue
plot_features(explanation)
I'm learning keras and would like to see the predicted values that are returned. The model returns a number of items, but none of them seem to be the predicted values.
library(keras)
library(MASS) # Boston housing data
df <- MASS::Boston
index <- sample(c(TRUE, FALSE), nrow(df), replace = TRUE, prob = c(0.7, 0.3))
# medv is the target, so keep it out of the feature matrices
train_features <- as.matrix(df[index, names(df) != "medv"])
test_features <- as.matrix(df[!index, names(df) != "medv"])
# normalise both sets with the training-set mean and standard deviation
mean <- apply(train_features, 2, mean)
sd <- apply(train_features, 2, sd)
train_data <- scale(train_features, center = mean, scale = sd)
test_data <- scale(test_features, center = mean, scale = sd)
train_targets <- df$medv[index]
test_targets <- df$medv[!index]
Here is where the model is built:
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(64, activation = "relu") %>%
layer_dense(64, activation = "relu") %>%
layer_dense(1)
model %>% compile(optimizer = "rmsprop",
loss = "mse",
metrics = "mse")
model
}
Next we set up five folds and track all_scores:
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
num_epochs <- 100
all_scores <- numeric()
for (i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- train_data[val_indices, ]
val_targets <- train_targets[val_indices]
partial_train_data <- train_data[-val_indices, ]
partial_train_targets <- train_targets[-val_indices]
model <- build_model()
model %>% fit(
partial_train_data,
partial_train_targets,
epochs = num_epochs,
batch_size = 16,
verbose = 0
)
results <- model %>%
evaluate(val_data, val_targets, verbose = 0)
all_scores[[i]] <- results[['mse']]
}
keras.RMSE <- sqrt(mean(all_scores))
However, none of the variables seem to have the predicted values. A few examples:
all_scores is a set of MSE scores (which I also want)
val_targets appears to have the wrong dimensions
model$fit does not return a value or set of values
model$predict generates predicted values, but those have already been generated, and I can't locate them.
How are the predicted values returned in a keras model?
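For reference, a minimal sketch of how the predicted values can be obtained, assuming the loop above has just finished (so model holds the network trained on the last fold): fit() only trains and returns a training history, so predictions have to be requested explicitly with predict().
# Predictions are not stored on the model; request them with predict()
val_preds <- model %>% predict(val_data)  # n x 1 matrix, one column per output unit
head(data.frame(predicted = as.numeric(val_preds), actual = val_targets))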
I am trying to replicate this https://cast42.github.io/blog/datascience/python/clustering/altair/shap/2020/04/23/explain-clusters-to-business.html#Kmeans-clustering but using R rather than Python as in the article.
What I haven't managed to reproduce is the "Shapley values for the three clusters" part:
for cnr in df_km['cluster'].unique():
    shap.summary_plot(shap_values[cnr], X, max_display=30, show=False)
    plt.title(f'Cluster {cnr}')
    plt.show()
These are the results I've gotten so far. Note that I want to output one graph per cluster label of the classification model.
Thanks!
# Package names
packages <- c("splitstackshape", "shapr", "Matrix", "xgboost", "SHAPforxgboost")
# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(packages[!installed_packages])}
winequality <- read.csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv", sep = ";")
# KMeans classifier attribute evaluation
winequality_escale <- scale(winequality)
set.seed(123)
km.res_3 <- kmeans(winequality_escale, 3, nstart = 25)
km.res_3$size
km.res_3$centers
aggregate(winequality, by=list(cluster=km.res_3$cluster), mean)
k3 <- fviz_cluster(km.res_3, data = winequality_escale,
                   palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
                   ellipse.type = "euclid", star.plot = TRUE, repel = TRUE,
                   ggtheme = theme_minimal()) + ggtitle("k = 3")
winequality <- as.matrix(winequality)
model <- xgboost(
data = winequality,
label = km.res_3$cluster,
nround = 20,
verbose = FALSE)
shap_values <- shap.values(xgb_model = model, X_train = winequality)
shap_values$mean_shap_score
shap_scores <- shap_values$shap_score  # keep the scores under a separate name so shap_values stays intact
# shap.prep() returns the long-format SHAP data, either from the model ...
shap_long <- shap.prep(xgb_model = model, X_train = winequality)
# ... or, equivalently, from a given shap_contrib matrix
shap_long <- shap.prep(shap_contrib = shap_scores, X_train = winequality)
# **SHAP summary plot**
shap.plot.summary(shap_long)
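As for the per-cluster part, here is a minimal sketch assuming the objects built above (km.res_3, shap_scores, winequality); it mirrors the Python loop from the article by subsetting the SHAP contributions by cluster assignment and drawing one summary plot per cluster:
# One summary plot per k-means cluster (illustrative adaptation, not from the original post)
for (cl in sort(unique(km.res_3$cluster))) {
  rows <- km.res_3$cluster == cl
  shap_long_cl <- shap.prep(shap_contrib = shap_scores[rows, ],
                            X_train = winequality[rows, ])
  print(shap.plot.summary(shap_long_cl) +
          ggplot2::ggtitle(paste("Cluster", cl)))
}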
Looking for a kind soul to help me solve this error in R with my current RF model:
Error: Response is constant across permutations. Please check your model
Here are the files needed to run the code: link
Here is my code:
library("lime")
library("randomForest")
RF <- readRDS("RF_classifier4sRNA.rds") # Load the model
origTrainingData <- read.csv("training_combined.csv", header = TRUE, sep = ",")  # load original training data
origTrainingDataLabels <- read.csv("training_combined_labels.csv", header = TRUE, sep = ",")  # load original training data labels
Classification <- origTrainingDataLabels$Class
origTrainingDataWithLabels <- cbind(origTrainingData, Classification)
# instances to explain ----
inputFile <- "FeatureTable.tsv"
testData <- read.table( inputFile, sep = "\t", header = TRUE)
class(testData)
testDataPredictions <- predict(RF, testData, type="prob")
testDataPredictions
# randomForest
# RF <- readRDS("RF_classifier4sRNA.rds")
# pred <- predict(RF, data, type = "prob")
predict_model.randomForest <- function(x, newdata, type, ...) {
res <- predict(x, newdata = newdata, ...)
switch(
type,
raw = data.frame(Response = res$class, stringsAsFactors = FALSE),
prob = as.data.frame(res["posterior"], check.names = FALSE)
)
}
model_type.randomForest <- function(x, ...) 'classification'
lime_explainer <- lime( origTrainingData, # Original training data
RF, # The model to explain
bin_continuous = TRUE, # Should continuous variables be binned
# when making the explanation
n_bins = 5, # The number of bins for continuous variables
# if bin_continuous = TRUE
quantile_bins = FALSE # Should the bins be based on n_bins quantiles
# or spread evenly over the range of the training data
)
lime_explanations <- explain( testData, # Data to explain
lime_explainer, # Explainer to use
n_labels = 7,
n_features = 7,
n_permutations = 10,
feature_select = "none"
)
lime_explanations
To be fair, I'm not the author of the original random forest model, which can be found here: GitHub,
and the full documentation and all other related files can be found [here](https://peerj.com/articles/6304/).
I'm just trying to apply lime to that model.
Ultimately, my professor was able to help me out :D
So, here's how the functions should actually be for LIME to work in my particular use case:
predict_model.randomForest <- function(x, newdata, type, ...) {
  res <- predict(x, newdata = newdata, ...)
  # debugging output: inspect what predict() actually returns
  print(class(res))
  print(dim(res))
  print(res)
  # the switch() must come last so that its result is what the function returns
  switch(
    type,
    raw = data.frame(Response = ifelse(res[, 2] > 0.5, "sRNA", "notSRNA"),
                     stringsAsFactors = FALSE),
    prob = as.data.frame(res)  # lime expects a data.frame of class probabilities
  )
}
model_type.randomForest <- function(x, ...) 'classification'
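For completeness, a hedged usage sketch with these S3 methods in scope; the argument values below are illustrative rather than taken from the original run:
# Rebuild the explainer and explanations with the corrected methods registered
lime_explainer <- lime(origTrainingData, RF,
                       bin_continuous = TRUE, n_bins = 5, quantile_bins = FALSE)
lime_explanations <- explain(testData, lime_explainer,
                             n_labels = 1, n_features = 7,
                             n_permutations = 5000)  # lime's default; 10 is usually far too few
plot_features(lime_explanations[lime_explanations$case %in% c("1", "2"), ])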
I'm building a text classifier of Clinton & Trump tweets (the data can be found on Kaggle).
I'm doing EDA and modelling using the quanteda package:
library(dplyr)
library(stringr)
library(readr)     # read_csv()
library(lubridate) # as_date(), hms(), hour()
library(quanteda)
library(lime)
#data prep
tweet_csv <- read_csv("tweets.csv")
tweet_data <- tweet_csv %>%
select(author = handle,
text,
retweet_count,
favorite_count,
source_url,
timestamp = time) %>%
mutate(date = as_date(str_sub(timestamp, 1, 10)),
hour = hour(hms(str_sub(timestamp, 12, 19))),
tweet_num = row_number()) %>%
select(-timestamp)
# creating corpus and dfm
tweet_corpus <- corpus(tweet_data)
edited_dfm <- dfm(tweet_corpus, remove_url = TRUE, remove_punct = TRUE, remove = stopwords("english"))
set.seed(32984)
trainIndex <- sample.int(n = nrow(tweet_csv), size = floor(.8*nrow(tweet_csv)), replace = F)
train_dfm <- edited_dfm[as.vector(trainIndex), ]
train_raw <- tweet_data[as.vector(trainIndex), ]
train_label <- train_raw$author == "realDonaldTrump"
test_dfm <- edited_dfm[-as.vector(trainIndex), ]
test_raw <- tweet_data[-as.vector(trainIndex), ]
test_label <- test_raw$author == "realDonaldTrump"
# making sure train and test sets have the same features
test_dfm <- dfm_select(test_dfm, train_dfm)
# using quanteda's NB model
nb_model <- quanteda::textmodel_nb(train_dfm, train_label)
nb_preds <- predict(nb_model, test_dfm)
# defining textmodel_nb as classification model
class(nb_model)
model_type.textmodel_nb_fitted <- function(x, ...) {
return("classification")
}
# a wrapper function for data preprocessing
get_matrix <- function(df){
  corpus <- corpus(df)
  dfm(corpus, remove_url = TRUE, remove_punct = TRUE, remove = stopwords("english"))
}
Then I define the explainer; no problems here:
explainer <- lime(train_raw[1:5],
model = nb_model,
preprocess = get_matrix)
But when I run the explainer, even on exactly the same dataset used to build it, I get an error:
explanation <- lime::explain(train_raw[1:5],
explainer,
n_labels = 1,
n_features = 6,
cols = 2,
verbose = 0)
Error in predict.textmodel_nb_fitted(x, newdata = newdata, type = type, :
feature set in newdata different from that in training set
Does it have something to do with quanteda and dfms? I honestly don't see why this should happen. Any help will be great, thanks!
We can trace the error to predict_model, which calls predict.textmodel_nb_fitted (I used only the first 10 rows of train_raw to speed up computation):
traceback()
# 7: stop("feature set in newdata different from that in training set")
# 6: predict.textmodel_nb_fitted(x, newdata = newdata, type = type,
# ...)
# 5: predict(x, newdata = newdata, type = type, ...)
# 4: predict_model.default(explainer$model, case_perm, type = o_type)
# 3: predict_model(explainer$model, case_perm, type = o_type)
# 2: explain.data.frame(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
# 1: lime::explain(train_raw[1:10, 1:5], explainer, n_labels = 1,
# n_features = 5, cols = 2, verbose = 0)
The problem is that predict.textmodel_nb_fitted expects a dfm, not a data frame. For example, predict(nb_model, test_raw[1:5]) gives you the same "feature set in newdata different from that in training set" error. However, explain takes a data frame as its x argument.
A solution is to write a custom textmodel_nb_fitted method for predict_model that does the necessary object conversions before calling predict.textmodel_nb_fitted:
predict_model.textmodel_nb_fitted <- function(x, newdata, type, ...) {
  # convert the permuted data frame to a dfm restricted to the training features
  X <- corpus(newdata)
  X <- dfm_select(dfm(X), x$data$x)
res <- predict(x, newdata = X, ...)
switch(
type,
raw = data.frame(Response = res$nb.predicted, stringsAsFactors = FALSE),
prob = as.data.frame(res$posterior.prob, check.names = FALSE)
)
}
This gives us
explanation <- lime::explain(train_raw[1:10, 1:5],
explainer,
n_labels = 1,
n_features = 5,
cols = 2,
verbose = 0)
explanation[1, 1:5]
# model_type case label label_prob model_r2
# 1 classification 1 FALSE 0.9999986 0.001693861
I'm new to R and ML but have a focused question that I am trying to answer.
I'm using my own data but following Matt Dancho's example here to predict attrition: http://www.business-science.io/business/2017/09/18/hr_employee_attrition.html
I have removed zero-variance variables and scaled the remaining ones as per his update.
My issue is the explain() on explainer step. I get variations of the two errors below: the first from the original code and the second from a variation of it. Everything else runs up to that point.
explanation <- lime::explain(
as.data.frame(test_h2o[1:10,-1]),
explainer = explainer,
n_labels = 1,
n_features = 4,
kernel_width = 0.5)
gives:
Error during wrapup: arguments imply differing number of rows: 50000, 0
While
explanation <- lime::explain(
as.data.frame(test_h2o[1:500,-1]),
explainer = explainer,
n_labels = 1,
n_features = 5,
kernel_width = 1)
gives:
ERROR: Unexpected HTTP Status code: 500 Server Error (url = http://localhost:54321/3/PostFile?destination_frame=C%3A%2FUsers%2Fsim.s%2FAppData%2FLocal%2FTemp%2FRtmpykNkl1%2Ffileb203a8d4a58.csv_sid_afd3_26)
Error: lexical error: invalid char in json text.
<html> <head> <meta http-equiv=
(right here) ------^
Please let me know if you have any ideas or insights for this problem, or need additional info from me.
Try this and let me know what you get. Note that it assumes your Excel file is stored in a folder called "data" in your working directory. Use getwd() and setwd() to get/set the working directory (or use Projects in the RStudio IDE).
library(h2o) # Professional grade ML pkg
library(tidyquant) # Loads tidyverse and several other pkgs
library(readxl) # Super simple excel reader
library(lime) # Explain complex black-box ML models
library(recipes) # Preprocessing for machine learning
hr_data_raw_tbl <- read_excel(path = "data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx")
hr_data_organized_tbl <- hr_data_raw_tbl %>%
mutate_if(is.character, as.factor) %>%
select(Attrition, everything())
recipe_obj <- hr_data_organized_tbl %>%
recipe(formula = Attrition ~ .) %>%
step_rm(EmployeeNumber) %>%
step_zv(all_predictors()) %>%
step_center(all_numeric()) %>%
step_scale(all_numeric()) %>%
prep(data = hr_data_organized_tbl)
hr_data_bake_tbl <- bake(recipe_obj, newdata = hr_data_organized_tbl)
h2o.init()
hr_data_bake_h2o <- as.h2o(hr_data_bake_tbl)
hr_data_split <- h2o.splitFrame(hr_data_bake_h2o, ratios = c(0.7, 0.15), seed = 1234)
train_h2o <- h2o.assign(hr_data_split[[1]], "train" ) # 70%
valid_h2o <- h2o.assign(hr_data_split[[2]], "valid" ) # 15%
test_h2o <- h2o.assign(hr_data_split[[3]], "test" ) # 15%
y <- "Attrition"
x <- setdiff(names(train_h2o), y)
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 15
)
automl_leader <- automl_models_h2o@leader  # best model found by AutoML
explainer <- lime::lime(
as.data.frame(train_h2o[,-1]),
model = automl_leader,
bin_continuous = FALSE
)
explanation <- lime::explain(
x = as.data.frame(test_h2o[1:10,-1]),
explainer = explainer,
n_labels = 1,
n_features = 4,
n_permutations = 500,
kernel_width = 1
)
explanation
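If explain() now completes without the similarity error, the feature weights can be visualised as you originally intended:
plot_features(explanation)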