Having a binary Classification problem:
how would be possible to get the Shap Contribution for variables for a Ranger model?
Sample data:
library(ranger)
library(tidyverse)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
I have tried with several libraries(DALEX, shapr, fastshap, shapper) but I didnt get any solution.
I wish getting some result like SHAPforxgboost for xgboost like:
the output of shap.values which is the shap contribution of variables
the shap.plot.summary
Good Morning!,
According to what I have found, you can use ranger() with fastshap() as following:
library(fastshap)
library(ranger)
library(tidyverse)
data(iris)
# Binary Dataset
df <- iris
df$Target <- if_else(df$Species == "setosa",1,0)
df$Species <- NULL
x <- df %>% select(-Target)
# Train Ranger Model
model <- ranger(
x = df %>% select(-Target),
y = df %>% pull(Target))
# Prediction wrapper
pfun <- function(object, newdata) {
predict(object, data = newdata)$predictions
}
# Compute fast (approximate) Shapley values using 10 Monte Carlo repetitions
system.time({ # estimate run time
set.seed(5038)
shap <- fastshap::explain(model, X = x, pred_wrapper = pfun, nsim = 10)
})
# Load required packages
library(ggplot2)
theme_set(theme_bw())
# Aggregate Shapley values
shap_imp <- data.frame(
Variable = names(shap),
Importance = apply(shap, MARGIN = 2, FUN = function(x) sum(abs(x)))
)
Then for example, for variable importance, you can do:
# Plot Shap-based variable importance
ggplot(shap_imp, aes(reorder(Variable, Importance), Importance)) +
geom_col() +
coord_flip() +
xlab("") +
ylab("mean(|Shapley value|)")
Also, if you want individual predictions, the following is possible:
# Plot individual explanations
expl <- fastshap::explain(model, X = x ,pred_wrapper = pfun, nsim = 10, newdata = x[1L, ])
autoplot(expl, type = "contribution")
All this information has been found in here, and there is more to it: https://bgreenwell.github.io/fastshap/articles/fastshap.html
Check the link and solve your doubts ! :)
I launched two R packages to perform such tasks: One is "kernelshap" (crunching), the other one is "shapviz" (plotting).
library(randomForest)
library(kernelshap)
Ilibrary(shapviz)
set.seed(1)
fit <- randomForest(Sepal.Length ~ ., data = iris,)
# bg_X is usually a small (50-200 rows) subset of the data
# Step 1: Calculate Kernel SHAP values
s <- kernelshap(fit, iris[-1], bg_X = iris)
# Step 2: Turn them into a shapviz object
sv <- shapviz(s)
# Step 3: Gain insights...
sv_importance(sv, show_numbers = TRUE)
sv_dependence(sv, v = "Petal.Length", color_var = "auto")
Related
I want to plot ROC curve for my prediction and have this error
Error: Format of predictions is invalid. It couldn't be coerced to a list.
I've already tried to find the answer on google, asked chatgpt but nothing helped.
Variables svm_p and test$stroke both have the same data type and the same length.
If you can think of better way to plot ROC curve feel free to share it.
The code is provided below:
library(caret)
library(dplyr)
library(tidyr)
library(gridExtra)
library(Amelia)
library(naniar)
library(tidyverse)
library(ROCR)
start_time <- Sys.time()
#
#
prepared_dataset <- read_csv('prepared_data.csv')
prepared_dataset <- prepared_dataset[1:500, ]
#
set.seed(42)
# # total number of observations
n_obs <- nrow(prepared_dataset)
#
# # shuffle the dataset randomly
permuted_rows <- sample(n_obs)
#
# # Randomly order data
stroke_shuffled <- prepared_dataset[permuted_rows,]
#
# # Identify row to split on
split <- round(n_obs * 0.8)
#
# # Create train
train <- stroke_shuffled[1:split,]
#
# # Create test
test <- stroke_shuffled[(split + 1):nrow(stroke_shuffled),]
test[, 13]
#check if train is really 80% of the original
nrow(train) / nrow(prepared_dataset)
train$stroke = factor(train$stroke, levels = c(0, 1))
test$stroke = factor(test$stroke, levels = c(0, 1))
myGrid <- expand.grid(
C = c(1, 10, 100),
sigma = c(0.1, 1, 10)
)
svm <- train(
stroke ~ .,
train,
method = "svmRadial",
tuneGrid = myGrid
)
print(svm)
# # accuracy
svm_p <- predict(svm, newdata=test)
confusionMatrix(table(svm_p, test$stroke))
end_time <- Sys.time()
end_time - start_time
pred <- prediction(svm_p, test$stroke)
perf <- performance(pred,"tpr","fpr")
plot(perf,colorize=TRUE)
I'm trying to implement functions from bayesplot package on a INLA object and a little unsure of how to draw from the posterior predictive distribution. I think I almost have it but rstan draws are more variable than the INLA ones.
In rstan, using the simplified example from bayesplot vignette I can:
library(bayesplot)
library(ggplot2)
library(rstanarm)
library(ggpubr)
library(tidyverse)
#rstan model set up
roaches$roach100 <- roaches$roach1 / 100 # pre-treatment number of roaches (in 100s)
fit_poisson <- stan_glm(y ~ roach100 + treatment + senior, offset = log(exposure2), family = poisson(link = "log"), data = roaches, seed = 1111, refresh = 0)
#In order to use the PPC functions from the bayesplot package we need a vector y of outcome values:
y <- roaches$y
#and a matrix yrep of draws from the posterior predictive distribution,
yrep_poisson <- posterior_predict(fit_poisson, draws = 500)
#then plot:
p1 <- bayesplot::ppc_dens_overlay(y, yrep_poisson[1:50, ])
p1
I want to replicate that plot on a INLA object. According to the bayesplot vignette you can do this as they have provided code to define a simple pp_check method that creates fitted model objects of class e.g. foo:
pp_check.foo <- function(object, type = c("multiple", "overlaid"), ...) {
type <- match.arg(type)
y <- object[["y"]]
yrep <- object[["yrep"]]
stopifnot(nrow(yrep) >= 50)
samp <- sample(nrow(yrep), size = ifelse(type == "overlaid", 50, 5))
yrep <- yrep[samp, ]
if (type == "overlaid") {
ppc_dens_overlay(y, yrep, ...)
} else {
ppc_hist(y, yrep, ...)
}
}
To use pp_check.foo we can just make a list with y and yrep components and give it class foo:
x <- list(y = rnorm(200), yrep = matrix(rnorm(1e5), nrow = 500, ncol = 200))
class(x) <- "foo"
#create plot above:
pp_check(x, type = "overlaid")
INLA
#create same model but in inla:
library(INLA)
fit_poisson_inla <- inla(y ~ roach100 + treatment + senior, offset = log(exposure2), data = roaches,
control.predictor = list(compute = T),
family = "poisson")
inla_object_name$marginals.fitted.values returns a posterior predictive distribution for each y:
fit_poisson_inla$marginals.fitted.values
#so to get distribution for first oberservation:
fitted.Predictor.1 <- fit_poisson_inla$marginals.fitted.values[[1]]
I think repeatedly sampling from this would give me what I need but there are only 75 values (dim(fitted.Predictor.1) per observation used to create this distribution when in reality I would want to be sampling from a full range of values. I think we can do this (section 4.3 here) by using inla.tmarginal using linear predictor:
fitted_dist <- fit_poisson_inla$marginals.linear.predictor
#should i have used "inla.rmarginal(n, marginal)"?
marginal_dist <- lapply(fitted_dist, function(y) inla.tmarginal(function(x) {exp(x)}, y)) %>% map(~ as.data.frame(.) %>% rename(., xx = x))
#resample 500 times
yrep_poisson_inla <- as.matrix(bind_rows(rerun(500, lapply(marginal_dist, function(x) sample(x$xx, 1)) %>% as.data.frame())))
#convert to class foo for pp_check
x <- list(y = y, yrep = yrep_poisson_inla[1:50, ])
class(x) <- "foo"
p2 <- pp_check(x, type = "overlaid")
#plot
ggarrange(p1, p2, ncol = 1, nrow = 2, labels = c("rstan", "inla sample"))
My question is how do I correctly get a matrix of draws from the posterior predictive distribution from this inla (fit_poisson_inla) object to pass into pp_check? yrep_poisson produces discrete values while yrep_poisson_inla produces continuous values. There is a lot more variation in the rstan draws than INLA (second plot). Is what I have done correct and this is just some sampling issue or is it an artifact of the different methods? In more complicated examples the differences could be substantial.
Thanks
I have a problem when using replicate to repeat the function.
I tried to use the bootstrap to fit
a quadratic model using concentration as the predictor and Total_lignin as the response and going to report an estimate of the maximum with a corresponding standard error.
My idea is to create a function called bootFun that essentially did everything within one iteration of a for loop. bootFun took in only the data set the predictor, and the response to use (both variable names in quotes).
However, the SD is 0, not correct. I do not know where is the wrong place. Could you please help me with it?
# Load the libraries
library(dplyr)
library(tidyverse)
# Read the .csv and only use M.giganteus and S.ravennae.
dat <- read_csv('concentration.csv') %>%
filter(variety == 'M.giganteus' | variety == 'S.ravennae') %>%
arrange(variety)
# Check the data
head(dat)
# sample size
n <- nrow(dat)
# A function to do one iteration
bootFun <- function(dat, pred, resp){
# Draw the sample size from the dataset
sample <- sample_n(dat, n, replace = TRUE)
# A quadratic model fit
formula <- paste0('resp', '~', 'pred', '+', 'I(pred^2)')
fit <- lm(formula, data = sample)
# Derive the max of the value of concentration
max <- -fit$coefficients[2]/(2*fit$coefficients[3])
return(max)
}
max <- bootFun(dat = dat, pred = 'concentration', resp = 'Total_lignin' )
# Iterated times
N <- 5000
# Use 'replicate' function to do a loop
maxs <- replicate(N, max)
# An estimate of the max of predictor and corresponding SE
mean(maxs)
sd(maxs)
Base package boot, function boot, can ease the job of calling the bootstrap function repeatedly. The first argument must be the data set, the second argument is an indices argument, that the user does not set and other arguments can also be passed toit. In this case those other arguments are the predictor and the response names.
library(boot)
bootFun <- function(dat, indices, pred, resp){
# Draw the sample size from the dataset
dat.sample <- dat[indices, ]
# A quadratic model fit
formula <- paste0(resp, '~', pred, '+', 'I(', pred, '^2)')
formula <- as.formula(formula)
fit <- lm(formula, data = dat.sample)
# Derive the max of the value of concentration
max <- -fit$coefficients[2]/(2*fit$coefficients[3])
return(max)
}
N <- 5000
set.seed(1234) # Make the bootstrap results reproducible
results <- boot(dat, bootFun, R = N, pred = 'concentration', resp = 'Total_lignin')
results
#
#ORDINARY NONPARAMETRIC BOOTSTRAP
#
#
#Call:
#boot(data = dat, statistic = bootFun, R = N, pred = "concentration",
# resp = "Total_lignin")
#
#
#Bootstrap Statistics :
# original bias std. error
#t1* -0.4629808 -0.0004433889 0.03014259
#
results$t0 # this is the statistic, not bootstrapped
#concentration
# -0.4629808
mean(results$t) # bootstrap value
#[1] -0.4633233
Note that to fit a polynomial, function poly is much simpler than to explicitly write down the polynomial terms one by one.
formula <- paste0(resp, '~ poly(', pred, ',2, raw = TRUE)')
Check the distribution of the bootstrapped statistic.
op <- par(mfrow = c(1, 2))
hist(results$t)
qqnorm(results$t)
qqline(results$t)
par(op)
Test data
set.seed(2020) # Make the results reproducible
x <- cumsum(rnorm(100))
y <- x + x^2 + rnorm(100)
dat <- data.frame(concentration = x, Total_lignin = y)
I am having a problem using the predict() function for a mgcv::gam (training) model on a new (testing) dataset. The problem arises due to a mrf smooth I have integrated to account for the spatial nature of my data.
I use the following call to create my GAM model
## Run GAM with MRF
m <- gam(crime ~ s(district,k=nrow(traindata),
bs ='mrf',xt=list(nb=nbtrain)), #define MRF smooth
data = traindata,
method = 'REML',
family = scat(), #fit scaled t distribution
gamma = 1.4
)
where I predict the dependent variable crime using the neighbourhood structure, parsed into the model in the smooth term argument xt. The neighbourhood structure comes as a nb object that I created using the poly2nb() function.
Now, if I want to use predict() on a new testing dataset, I don't know how to pass the according neighbourhood structure into the call. Providing just the new data
pred <- predict.gam(m,newdata=testdata)
throws the following error:
Error in predict.gam(m, newdata = testdata) :
7, 16, 20, 28, 35, 36, 37, 43 not in original fit
Here's a full reproduction of the error using the Columbus dataset called from within R directly:
#ERROR REPRODUCTION
## Load packages
require(mgcv)
require(spdep)
require(dplyr)
## Load Columbus Ohio crime data (see ?columbus for details and credits)
data(columb.polys) #Columbus district shapes list
columb.polys <- lapply(columb.polys,na.omit) #omit NAs (unfortunate problem with the Columbus sample data)
data(columb) #Columbus data frame
df <- data.frame(district=numeric(0),x=numeric(0),y= numeric(0)) #Create empty df to store x, y and IDs for each polygon
## Extract x and y coordinates from each polygon and assign district ID
for (i in 1:length(columb.polys)) {
district <- i-1
x <- columb.polys[[i]][,1]
y <- columb.polys[[i]][,2]
df <- rbind(df,cbind(district,x,y)) #Save in df data.frame
}
## Convert df into SpatialPolygons
sp <- df %>%
group_by(district) %>%
do(poly=select(., x, y) %>%Polygon()) %>%
rowwise() %>%
do(polys=Polygons(list(.$poly),.$district)) %>%
{SpatialPolygons(.$polys)}
## Merge SpatialPolygons with data
spdf <- SpatialPolygonsDataFrame(sp,columb)
## Split into training and test sample (80/20 ratio)
splt <- sample(1:2,size=nrow(spdf),replace=TRUE,prob=c(0.8,0.2))
train <- spdf[splt==1,]
test <- spdf[splt==2,]
## Prepapre both samples and create NB objects
traindata <- train#data #Extract data from SpatialPolygonsDataFrame
testdata <- test#data
traindata <- droplevels(as(train, 'data.frame')) #Drop levels
testdata <- droplevels(as(test, 'data.frame'))
traindata$district <- as.factor(traindata$district) #Factorize
testdata$district <- as.factor(testdata$district)
nbtrain <- poly2nb(train, row.names=train$Precinct, queen=FALSE) #Create NB objects for training and test sample
nbtest <- poly2nb(test, row.names=test$Precinct, queen=FALSE)
names(nbtrain) <- attr(nbtrain, "region.id") #Set region.id
names(nbtest) <- attr(nbtest, "region.id")
## Run GAM with MRF
m <- gam(crime ~ s(district, k=nrow(traindata), bs = 'mrf',xt = list(nb = nbtrain)), # define MRF smooth
data = traindata,
method = 'REML', # fast version of REML smoothness selection; alternatively 'GCV.Cp'
family = scat(), #fit scaled t distribution
gamma = 1.4
)
## Run prediction using new testing data
pred <- predict.gam(m,newdata=testdata)
SOLUTION:
I finally found the time to update this post with the solution. Thanks to everyone for helping me out. Here is the code for implementing k-fold CV with a random training-testing split:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = data,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Generate test dataset
testdata <- data[data$weight==0,] #Select test data by weight
#Predict test data
pred <- predict(m,newdata=testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
I have one question criticism with the current solution, being that the full dataset was used to "train" the model meaning that the predictions are going to be biased since the testdata was used to train it.
This only requires a couple minor tweaks to fix:
#Apply k-fold cross validation
mses <- data.frame() #Create empty df to store CV squared error values
scores <- data.frame() #Create empty df to store CV R2 values
set.seed(42) #Set seed for reproducibility
k <- 10 #Define number of folds
#For loop for each fold
for (i in 1:k) {
# Create weighting column
data$weight <- sample(c(0,1),size=nrow(data),replace=TRUE,prob=c(0.2,0.8)) #0 Indicates testing sample, 1 training sample
#Generate training dataset
trainingdata <- data[data$weight == 1, ] #Select test data by weight
#Generate test dataset
testdata <- data[data$weight == 0, ] #Select test data by weight
#Run GAM with MRF
ctrl <- gam.control(nthreads = 6) #Set controls
m <- gam(crime ~ s(disctrict, k=nrow(data), bs = 'mrf',xt = list(nb = nb)), #define MRF smooth
data = trainingdata,
weights = data$weight, #Use only weight==1 observations (training)
method = 'REML',
control = ctrl,
family = scat(),
gamma = 1.4
)
#Predict test data
pred <- predict(m,newdata = testdata)
#Extract MSES
mses[i,1] <- mean((data$R_MeanDiff[data$weight==0] - pred)^2)
scores[i,1] <- summary(m)$r.sq
}
#Get average scores from each k-fold test
av.mse.GMRF <- mean(mses$V1)
av.r2.GMRF <- mean(scores$V1)
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))