I'm looking for an efficient way to plot trees in RStudio, H2O's Flow, or a local HTML page from H2O's RF and GBM models, similar to the one in the image in the link below. Specifically, how do you plot trees for the fitted models rf1 and gbm1 produced by the code below, perhaps by parsing the output of h2o.download_pojo(rf1) or h2o.download_pojo(gbm1)?
# # The following two commands remove any previously installed H2O packages for R.
# if ("package:h2o" %in% search()) { detach("package:h2o", unload=TRUE) }
# if ("h2o" %in% rownames(installed.packages())) { remove.packages("h2o") }
# # Next, we download packages that H2O depends on.
# pkgs <- c("methods","statmod","stats","graphics","RCurl","jsonlite","tools","utils")
# for (pkg in pkgs) {
# if (! (pkg %in% rownames(installed.packages()))) { install.packages(pkg) }
# }
#
# # Now we download, install h2o package
# install.packages("h2o", type="source", repos=(c("http://h2o-release.s3.amazonaws.com/h2o/rel-turchin/3/R")))
library(h2o)

# Start (or connect to) a local H2O cluster; nthreads = -1 and a 2 GB memory cap.
h2o.init(nthreads = -1, max_mem_size = "2G")
h2o.removeAll() # clean slate - just in case the cluster was already running

## Load data - available to download from link below
## https://www.dropbox.com/s/gu8e2o0mzlozbu4/SampleData.csv?dl=0
df <- h2o.importFile(path = normalizePath("../SampleData.csv"))

# Two ratios produce three frames: 40% train, 30% validation, and the
# remaining 30% as the test set.
splits <- h2o.splitFrame(df, c(0.4, 0.3), seed = 1234)
train <- h2o.assign(splits[[1]], "train.hex")
valid <- h2o.assign(splits[[2]], "valid.hex")
# Use the third split here; reusing splits[[2]] would make the test frame a
# copy of the validation frame.
test <- h2o.assign(splits[[3]], "test.hex")

# Column layout: response in column 1, predictors in columns 2 to 169.
predictor_col_start_pos <- 2
predictor_col_end_pos <- 169
predicted_col_pos <- 1

# Random forest: up to 2000 trees, scored every iteration, with early stopping
# (stopping_rounds = 10).
rf1 <- h2o.randomForest(
  training_frame = train, validation_frame = valid,
  x = predictor_col_start_pos:predictor_col_end_pos, y = predicted_col_pos,
  model_id = "rf_covType_v1", ntrees = 2000, stopping_rounds = 10,
  score_each_iteration = TRUE, seed = 2001
)

# GBM: 20 trees of depth 10 with early stopping (stopping_rounds = 2,
# stopping_tolerance = 0.01).
gbm1 <- h2o.gbm(
  training_frame = train, validation_frame = valid,
  x = predictor_col_start_pos:predictor_col_end_pos, y = predicted_col_pos,
  model_id = "gbm_covType2", seed = 2002, ntrees = 20,
  learn_rate = 0.2, max_depth = 10, stopping_rounds = 2,
  stopping_tolerance = 0.01, score_each_iteration = TRUE
)

## Next step would be to plot trees for fitted models rf1 and gbm1
# Print each model's POJO (Plain Old Java Object) to screen
h2o.download_pojo(rf1)
h2o.download_pojo(gbm1)
I think it may be the solution you are looking for;
library(h2o)
h2o.init()

# Load the airlines sample data from the public H2O test-data bucket.
df <- h2o.importFile(
  "http://s3.amazonaws.com/h2o-public-test-data/smalldata/airlines/allyears2k_headers.zip"
)

# Fit a deliberately small GBM (5 trees of depth 3) so the trees stay readable.
model <- h2o.gbm(
  model_id = "model",
  training_frame = df,
  x = c("Year", "Month", "DayofMonth", "DayOfWeek", "UniqueCarrier"),
  y = "IsDepDelayed",
  max_depth = 3,
  ntrees = 5
)

# Export the model as a MOJO into the working directory, without the
# genmodel jar.
h2o.download_mojo(model, path = getwd(), get_genmodel_jar = FALSE)
Now download the latest stable h2o release from http://www.h2o.ai/download/ and run the PrintMojo tool from the command line.
# Run PrintMojo on tree 0 of the exported MOJO (model.zip), writing model.gv.
java -cp h2o.jar hex.genmodel.tools.PrintMojo --tree 0 -i model.zip -o model.gv
# Render model.gv to a PNG image with Graphviz's dot.
dot -Tpng model.gv -o model.png
# View the image (NOTE(review): `open` looks like the macOS launcher; use your platform's equivalent).
open model.png
More info: http://docs.h2o.ai/h2o/latest-stable/h2o-genmodel/javadoc/index.html
The new Tree API introduced in H2O 3.22.0.1 (October 2018) changes the whole game of visualizing H2O trees. The general workflow may look like this:
A detailed example with code can be found here: Finally, You Can Plot H2O Decision Trees in R.
Related
I'm trying to save all the models from an h2o.automl run (part of the h2o package). Currently I am able to save a single model using h2o.saveModel(aml@leader, path = "/home/data/user").
How can I save all the models?
Here is my attempt on a sample dataset:
library(h2o)
h2o.init()

# Import the prostate sample data from GitHub straight into the H2O cluster.
prostate.hex <- h2o.importFile(
  path = paste(
    "https://raw.github.com",
    "h2oai/h2o-2/master/smalldata/logreg/prostate.csv",
    sep = "/"
  ),
  destination_frame = "prostate.hex"
)
Get data from github or import via readr:
library(readr)

# Alternative import: read the CSV locally, then push it to the H2O cluster.
prostate <- read_csv("/home/data/user/prostate.csv")
prostate.hex <- as.h2o(prostate, destination_frame = "prostate.hex")

# Run AutoML for at most 180 seconds, leaving out stacked ensembles.
aml <- h2o.automl(
  x = c("AGE", "RACE", "PSA", "DCAPS"),
  y = "CAPSULE",
  training_frame = prostate.hex,
  max_runtime_secs = 180,
  exclude_algos = c("StackedEnsemble")
)
Now I'm trying to save the models within aml:
# Collect the model ids from the AutoML leaderboard. The leaderboard is an S4
# slot, so it is accessed with `@` (a `#` here would start a comment).
mod_ids <- as_tibble(aml@leaderboard$model_id)
Now I can't figure out how to save the models:
# Walk the leaderboard ids one row at a time; the save call is still disabled.
# seq_len() keeps the loop from running when mod_ids has no rows.
for (i in seq_len(nrow(mod_ids))) {
  print(mod_ids[i, ])
  # h2o.saveModel(object = aml@leaderboard[[i]], "/home/data/user/")
}
Here is what I've tried:
tutorial automl
H2O AUTOML: How to save reuse and build on top of existing automl models
Try this, it'll do your job:
# Save every model listed on the AutoML leaderboard to disk.
# seq_len() keeps the loop from running when mod_ids has no rows.
for (i in seq_len(nrow(mod_ids))) {
  # Fetch the model object for row i (column 1 is presumably the model id).
  aml1 <- h2o.getModel(aml@leaderboard[i, 1])
  # Pass that model object to h2o.saveModel.
  h2o.saveModel(object = aml1, path = "C:/Users/sm/Documents/stack/models")
}
I'm trying to get leave-one-out predicted values. Please help me with this "can't find object" issue. I have searched for similar issues, but haven't managed to figure it out. This is on Windows 10.
Thanks in advance
# Reproduction script: leave-one-out predictions from a gamlss fit, run in
# parallel through foreach/doParallel. As written, it fails with the error
# quoted on the last line.
library('gamlss')
library('foreach')
library('doParallel')
# Register a parallel backend with 4 cores for %dopar%.
registerDoParallel(cores = 4)
# Generate data: 30 rows of random counts TP and FP (0-100) and a uniform x
set.seed(314)
sample.size <- 30
input.processed.cut <- data.frame(TP = round(runif(sample.size) * 100),
FP = round(runif(sample.size) * 100),
x = runif(sample.size))
# Fit Beta-binomial (family BB) with the two-column response cbind(TP, FP)
model3 <- gamlss(formula = cbind(TP, FP) ~ x,
family = BB,
data = input.processed.cut)
# Get the leave-one-out values
# model.obj:  fitted model that is refitted via update() on each reduced data set
# input.data: data frame; row i is dropped for the refit and then predicted
# Returns a data frame with a single column `result` taken from column 1 of
# the row-bound predictions.
loo_predict.mu <- function(model.obj, input.data) {
# One task per row; per-task results are combined with rbind.
yhat <- foreach(i = 1 : nrow(input.data), .packages="gamlss", .combine = rbind) %dopar% {
# Refit without row i ...
updated.model.obj <- update(model.obj, data = input.data[-i, ])
# ... then predict mu on the response scale for the held-out row.
predict(updated.model.obj, what = "mu", newdata = input.data[i,], type = "response")
}
return(data.frame(result = yhat[, 1], row.names = NULL))
}
par.run <- loo_predict.mu(model3, input.processed.cut)
# Error in { : task 1 failed - "object 'input.data' not found"
> version
_
platform x86_64-w64-mingw32
arch x86_64
os mingw32
system x86_64, mingw32
status
major 3
minor 4.3
year 2017
month 11
day 30
svn rev 73796
language R
version.string R version 3.4.3 (2017-11-30)
nickname Kite-Eating Tree
I got a response from gamlss team and verified that their solution works. The only thing to change was to provide "data" along with "newdata" to predict().
# Leave-one-out predictions of mu for a fitted gamlss model.
#
# model.obj:  fitted gamlss model; it is refitted with update() on each
#             reduced data set.
# input.data: data frame the model is refitted on; each row is held out in
#             turn and then predicted.
#
# Returns a data frame with one column, `result`, holding the held-out
# prediction for every row of input.data (zero rows for empty input).
loo_predict.mu <- function(model.obj, input.data) {
  n_obs <- nrow(input.data)
  # Nothing to hold out; without this guard 1:0 would iterate over c(1, 0).
  if (n_obs == 0L) {
    return(data.frame(result = numeric(0)))
  }
  yhat <- foreach(i = seq_len(n_obs), .packages = "gamlss", .combine = rbind) %dopar% {
    updated.model.obj <- update(model.obj, data = input.data[-i, ])
    # Supplying `data` alongside `newdata` is what fixes the
    # "object 'input.data' not found" failure on the workers.
    predict(updated.model.obj, what = "mu", data = input.data[-i, ],
            newdata = input.data[i, ], type = "response")
  }
  data.frame(result = yhat[, 1], row.names = NULL)
}
I am trying to tune an xgboost model with a multiclass dependent variable in R. I am using MLR to do this, however I run into an error where xgboost doesn't have predict within its namespace - which I assume MLR wants to use. I have had a look online and see that other people have encountered similar issues. However, I can't entirely understand the answers that have been provided (e.g. https://github.com/mlr-org/mlr/issues/935), when I try to implement them the issue persists. My code is as follows:
# Tune parameters

# Create tasks. The target has to be a factor for makeClassifTask to work.
train$result <- as.factor(train$result)
test$result <- as.factor(test$result)
traintask <- makeClassifTask(data = train, target = "result")
testtask <- makeClassifTask(data = test, target = "result")

lrn <- makeLearner("classif.xgboost", predict.type = "response")

# Fixed learner settings (not tuned): objective, number of rounds, etc.
lrn$par.vals <- list(
  objective = "multi:softprob",
  num_class = 3, # there are three outcome categories
  eval_metric = "merror",
  nrounds = 100L,
  eta = 0.1
)

# Search space for the tuned parameters
params <- makeParamSet(
  makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
  makeIntegerParam("max_depth", lower = 3L, upper = 10L),
  makeNumericParam("min_child_weight", lower = 1L, upper = 10L),
  makeNumericParam("subsample", lower = 0.5, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.5, upper = 1)
)

# Resampling strategy: stratified 5-fold cross-validation
rdesc <- makeResampleDesc("CV", stratify = TRUE, iters = 5L)

# Search strategy: random search with 10 iterations
ctrl <- makeTuneControlRandom(maxit = 10L)

# parallelStartSocket(cpus = detectCores()) # Enable parallel processing
mytune <- tuneParams(
  learner = lrn,
  task = traintask,
  resampling = rdesc,
  measures = acc,
  par.set = params,
  control = ctrl,
  show.info = TRUE
)
The specific error I get is:
Error: 'predict' is not an exported object from 'namespace:xgboost'
My package versions are:
packageVersion("xgboost")
[1] ‘0.6.4’
packageVersion("mlr")
[1] ‘2.8’
Would anyone know what I should do here?
Thanks in advance.
I'm trying to train a boosting model on a data frame, using the Caret and gbm packages in R. I've been able to build models successfully with default parameters; however, I continue to hit this error, when I attempt to customize the summary function:
Error in vector(type, length) :
vector: cannot make a vector of mode 'NULL'.
This is the first question I've posted, as I'm usually able to root up info to solve the problem. In this case, I can't seem to find a similar issue.
The following code is intended to reproduce the error. Let me know if it doesn't, or if I should include additional info, as I'm more than happy to do so.
System.info:
sysname: Windows
release: 7 x64
version: build 7601, Service Pack 1
version.string: R version 3.1.3 (2015-03-09)
system: x86_64, mingw32
library(plyr)
library(caret)
library(dplyr)
# Toy data: 100 rows of standard-normal draws for a response and two predictors.
example <- data.frame(
  response = rnorm(100),
  predictor1 = rnorm(100),
  predictor2 = rnorm(100)
)
# Custom caret summary function: mean absolute error of the predictions.
#
# data:  data frame with numeric columns `obs` (observed) and `pred` (predicted).
# lev:   unused; part of the summary-function signature.
# model: unused; part of the summary-function signature.
#
# Returns a length-one numeric vector named "AE".
aeSummary <- function(data, lev = NULL, model = NULL) {
  # Collapse the per-row absolute errors into a single metric. Returning one
  # value per row, with only the first element named "AE", is not a valid
  # metric vector for train().
  c(AE = mean(abs(data$obs - data$pred)))
}
# Fit a GBM on one fixed hyper-parameter combination, selecting on the custom
# "AE" metric (maximize = FALSE, so lower is better).
modelFit <- train(
  response ~ .,
  data = example,
  method = "gbm",
  metric = "AE",
  maximize = FALSE,
  tuneGrid = data.frame(
    n.trees = 5,
    interaction.depth = 5,
    shrinkage = 0.05,
    n.minobsinnode = 6
  ),
  trControl = trainControl(summaryFunction = aeSummary)
)
I am trying to use R2WinBUGS using this example:
code
(Please only consider the part: ### 5.4. Analysis using WinBUGS)
I am getting this error message:
Error in file(con, "wb") : cannot open the connection
In addition: Warning messages:
1: In file.create(to[okay]) :
cannot create file 'c:/Program Files/WinBUGS14//System/Rsrc/Registry_Rsave.odc', reason 'Permission denied'
2: In file(con, "wb") :
cannot open file 'c:/Program Files/WinBUGS14//System/Rsrc/Registry.odc': Permission denied
Warning message:
running command '"c:/Program Files/WinBUGS14//WinBUGS14.exe" /par "D:/R2WinBUGS/normal/script.txt"' had status 1
>
I am not sure whether this is crucial for correct functionality (everything else seems to look ok). Is there a way to get rid of this?
Thanks.
Christian
PS:
This is the R code:
# Fit a normal model (mean and sd) to simulated body-mass data with WinBUGS via
# R2WinBUGS, then inspect convergence and the posterior draws.
library(R2WinBUGS)
# All files (model.txt, WinBUGS scripts) are written to this directory.
setwd("D:/R2WinBUGS/normal")
# Simulated data; only y1000 is passed to WinBUGS below.
y10 <- rnorm(n = 10, mean = 600, sd = 30) # Sample of 10 birds
y1000 <- rnorm(n = 1000, mean = 600, sd = 30) # Sample of 1000 birds
# Save BUGS description of the model to working directory
# sink() redirects console output into model.txt until the matching sink() call.
sink("model.txt")
cat("
model {
# Priors
population.mean ~ dunif(0,5000) # Normal parameterized by precision
precision <- 1 / population.variance # Precision = 1/variance
population.variance <- population.sd * population.sd
population.sd ~ dunif(0,100)
# Likelihood
for(i in 1:nobs){
mass[i] ~ dnorm(population.mean, precision)
}
}
",fill=TRUE)
sink()
# Package all the stuff to be handed over to WinBUGS
# Bundle data
win.data <- list(mass = y1000, nobs = length(y1000))
# Function to generate starting values
inits <- function()
list (population.mean = rnorm(1,600), population.sd = runif(1, 1, 30))
# Parameters to be monitored (= to estimate)
params <- c("population.mean", "population.sd", "population.variance")
# MCMC settings
nc <- 3 # Number of chains
ni <- 1000 # Number of draws from posterior (for each chain)
nb <- 1 # Number of draws to discard as burn-in
nt <- 1 # Thinning rate
# Start Gibbs sampler: Run model in WinBUGS and save results in object called out
# No bugs.directory is given here, so the default WinBUGS install location is used.
out <- bugs(data = win.data, inits = inits, parameters.to.save = params, model.file = "model.txt",
n.thin = nt, n.chains = nc, n.burnin = nb, n.iter = ni, debug = TRUE, DIC = TRUE, working.directory = getwd())
# Inspect the returned object
ls()
out # Produces a summary of the object
names(out)
str(out)
# Convergence check
hist(out$summary[,8]) # Rhat values in the eighth column of the summary
which(out$summary[,8] > 1.1) # None in this case
# Trace plots of all draws, one panel per third-dimension index of sims.array
par(mfrow = c(3,1))
matplot(out$sims.array[1:999,1:3,1], type = "l")
matplot(out$sims.array[,,2] , type = "l")
matplot(out$sims.array[,,3] , type = "l")
# Same trace plots, restricted to the first 20 draws
par(mfrow = c(3,1))
matplot(out$sims.array[1:20,1:3,1], type = "l")
matplot(out$sims.array[1:20,,2] , type = "l")
matplot(out$sims.array[1:20,,3] , type = "l")
# Histograms of the posterior draws for each monitored parameter
par(mfrow = c(3,1))
hist(out$sims.list$population.mean, col = "grey")
hist(out$sims.list$population.sd, col = "blue")
hist(out$sims.list$population.variance, col = "green")
# Back to a single panel: joint plots of the draws
par(mfrow = c(1,1))
plot(out$sims.list$population.mean, out$sims.list$population.sd)
pairs(cbind(out$sims.list$population.mean, out$sims.list$population.sd, out$sims.list$population.variance))
# Numerical summaries of the draws
summary(out$sims.list$population.mean)
summary(out$sims.list$population.sd)
sd(out$sims.list$population.mean)
sd(out$sims.list$population.sd)
# Intercept-only linear model on the same data, for comparison
summary(lm(y1000 ~ 1))
This is probably caused by Windows UAC. By default, UAC doesn't allow programs to write almost anywhere except the user's folder. You can change that by running R as administrator. However, I think that will change the library folder unless it is hardcoded in Renviron.site (inside the R\etc folder), but I'm not 100% sure about that.
I was able to fix the problem by defining "bugs.directory".
# Same sampler call, but pointing R2WinBUGS at a WinBUGS copy in c:/WinBUGS14
# via bugs.directory.
out <- bugs(
  data = win.data,
  inits = inits,
  parameters.to.save = params,
  model.file = "model.txt",
  n.thin = nt,
  n.chains = nc,
  n.burnin = nb,
  n.iter = ni,
  debug = FALSE,
  DIC = TRUE,
  working.directory = getwd(),
  bugs.directory = 'c:/WinBUGS14'
)
Your link goes out to a huge file that spans many chapters of a book. In the comments section it says:
# You may have to add a 'working.directory' argument to calls to
# the function bugs().
Have you done that yet? There's also a bunch of user-specific stuff like:
setwd("C:/_Marc Kery/_WinBUGS book/Naked code") # May have to adapt that
Have you appropriately modified those items?