Overlay decision boundary for random forests and boosting - R

I generate some random data and am trying to overlay a decision boundary fitted with random forests and with boosting. I can reproduce the problem below. I generate the data, and with a regression tree I can easily overlay the decision boundary using the following code:
library(tidyverse)
library(rsample)       # initial_split(), training(), testing()
library(tree)
library(randomForest)
library(gbm)
# set seed and generate some random data
set.seed(123)
Dat <- tibble(
x1 = rnorm(100),
x2 = rnorm(100)
) %>% mutate(y = as_factor(ifelse(x1^2 + x2^2 > 1.39, "A", "B")))
circlepts <- tibble(theta = seq(0, 2*pi, length = 100)) %>%
mutate(x = sqrt(1.39) * sin(theta), y = sqrt(1.39) * cos(theta))
# graph the data and draw the boundary
p <- ggplot(Dat) + geom_point(aes(x1, x2, color = y)) + coord_fixed() +
geom_polygon(data = circlepts, aes(x, y), color = "blue", fill = NA)
# convert character to binary inputs making classification easier
binVec = as.vector(Dat$y)
binVec[which(binVec =="A")] = 1
binVec[which(binVec == "B")] = 0
binVec = as.numeric(binVec)
Dat$y = binVec
# split the data up
datasplit <- initial_split(Dat, prop = 0.7)
training_set <- as_tibble(training(datasplit))
testing_set <- as_tibble(testing(datasplit))
tree_fit <- tree(y ~ ., data = training_set)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  modelr::add_predictions(tree_fit)
# plot the data with the decision overlay of the tree fit
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
Now if I try doing so with random forests or gradient boosting, add_predictions doesn't cooperate that well...
rf_fit <- randomForest(y ~ ., data = training_set, mtry = 2, ntree = 500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
##ERROR: Error in if (is.na(out.type)) stop("type must be one of 'response', 'prob', 'vote'") : argument is of length zero
And for gradient boosting:
fitBoost <- gbm(y ~ ., data = Dat, distribution = "gaussian", n.trees = 1000)
pred <- predict(fitBoost, newdata = training_set, n.trees = 1000)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  modelr::add_predictions(fitBoost)
### ERROR: Error in paste("Using", n.trees, "trees...\n") : argument "n.trees" is missing, with no default
It seems to be a very simple problem. Could someone help me out?

The following code works with your random forest: converting y to a factor makes randomForest() fit a classification forest, so predict() returns class labels that add_predictions() can attach directly.
training_set$y <- factor(training_set$y)
rf_fit <- randomForest(y ~ ., data=training_set, mtry=2, ntree=500)
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  modelr::add_predictions(rf_fit)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
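If you prefer the boundary drawn where the estimated class probability crosses 0.5, rather than contouring the integer class codes, here is a small sketch along the same lines (the prob_A column name is just something I made up):
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50))
# predict() on a classification forest can return class probabilities;
# column "1" corresponds to the level that encoded class "A" above
grid$prob_A <- predict(rf_fit, newdata = grid, type = "prob")[, "1"]
p + geom_contour(data = grid, aes(x2, x1, z = prob_A), breaks = 0.5)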
And here is the code for the gradient boosting machine. The error above occurs because predict.gbm() has no default for n.trees and modelr::add_predictions() never passes one, so the helpers below wrap predict() to supply it:
fitBoost <- gbm(y ~ ., data=Dat, distribution="gaussian", n.trees=1000)
pred <- predict(fitBoost, newdata=training_set, n.trees=1000)
add_predictions2 <- function(data, model, var = "pred", type = NULL) {
  data[[var]] <- predict2(model, data, type = type)
  data
}
predict2 <- function(model, data, type = NULL) {
  if (is.null(type)) {
    stats::predict(model, data, n.trees = 1000)
  } else {
    stats::predict(model, data, type = type, n.trees = 1000)
  }
}
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50)) %>%
  add_predictions2(fitBoost)
p + geom_contour(data = grid, aes(x2, x1, z = as.numeric(pred)), binwidth = 1)
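If you would rather not shadow modelr's helpers, a simpler sketch is to call predict() yourself and attach the result to the grid, since the only real problem is that predict.gbm() needs n.trees:
grid <- crossing(x1 = modelr::seq_range(testing_set$x1, 50),
                 x2 = modelr::seq_range(testing_set$x2, 50))
# pass n.trees explicitly, which add_predictions() cannot do
grid$pred <- predict(fitBoost, newdata = grid, n.trees = 1000)
# breaks = 0.5 draws the contour at the midpoint of the 0/1 responses
p + geom_contour(data = grid, aes(x2, x1, z = pred), breaks = 0.5)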

Related

How to plot the result of a regression prediction in R

I am beginning with ML in R, and I really like the idea of visualizing the results of my calculations. I am wondering how to plot a prediction.
library("faraway")
library(tibble)
library(stats)
data("sat")
df<-sat[complete.cases(sat),]
mod_sat_sal <- lm(total ~ salary, data = df)
new_teacher <- tibble(salary = 40)
predict(mod_sat_sal, new_teacher)
Expected result:
Data and Regression Model
data(sat, package = "faraway")
df <- sat[complete.cases(sat), ]
model <- lm(total ~ salary, data = df)
Method (1) : graphics way
# Compute the confidence band
x <- seq(min(df$salary), max(df$salary), length.out = 300)
x.conf <- predict(model, data.frame(salary = x),
interval = 'confidence')
# Plot
plot(total ~ salary, data = df, pch = 16, xaxs = "i")
polygon(c(x, rev(x)), c(x.conf[, 2], rev(x.conf[, 3])),
col = gray(0.5, 0.5), border = NA)
abline(model, lwd = 3, col = "darkblue")
Method (2) : ggplot2 way
library(ggplot2)
ggplot(df, aes(x = salary, y = total)) +
geom_point() +
geom_smooth(method = "lm")
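To answer the original question of showing the single prediction itself, a minimal sketch (using the model, df and new_teacher objects defined above) adds the predicted point on top of the plot:
# predict total for the new salary and append it to new_teacher
new_teacher$total <- predict(model, new_teacher)
ggplot(df, aes(x = salary, y = total)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_point(data = new_teacher, colour = "red", size = 3)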

Visualization of predict glm using multiple variables in R

I would like to use the following dataset to fit a glm and visualize its predict() output.
y=c(-18.948,-19.007,-18.899,-19.022,-20.599,-19.778,-17.344,-20.265,-20.258,-19.886,-18.05,-19.824,-20.1,-20.508,-20.455,-16.573,-20.249,-20.205,-20.436,-16.358,-17.717,-19.794,-20.372,-19.944,-20.072,-19.889,-20.139,-19.132,-20.275,-19.953,-19.769,-20.2,-19.638,-17.419,-19.086,-18.347,-18.73,-18.872,-18.956,-19.28,-18.176,-19.036,-18.084,-20.11,-19.641,-19.656,-19.25,-18.68,-19.089,-18.969,-18.161,-17.603,-20.37,-19.233,-18.961,-19.083,-20.118,-19.795,-17.154,-16.75)
x1=c(9.698,9.583,9.356,9.326,9.438,9.733,8.803,8.973,9.141,9.044,8.788,9.377,9.26,10.186,9.035,9.569,9.431,9.09,8.776,9.117,9.393,9.408,9.307,8.868,8.398,8.407,9.364,9.074,8.444,9.122,10.11,7.81,9.777,6.472,9.521,8.92,9.341,9.446,9.08,8.071,8.047,8.019,7.419,9.022,9.981,9.337,9.989,10.013,9.31,10.843,8.337,9.103,6.438,9.372,9.071,8.749,9.016,8.181,9.284,8.44)
x2=c('S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S03','S04','S04','S04','S04','S04','S04','S06','S06','S06','S06','S06','S06','S06','S06','S07','S07','S07','S07','S07','S07','S07','S07','S07','S08','S08','S09','S09','S09','S09','S09','S09','S09','S10','S03','S03','S03','S04','S04','S07','S07')
x3=c('A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','A1','P1','P1','P1','P1','P1','P1','P1')
mydata <- data.frame(y,x1,x2,x3)
Fit glm Model:
myglm <- glm(y ~ x1+x2+x3+x1:x2, family="gaussian", data= mydata)
Prediction:
1). Extract the range of x1
min <- min(mydata$x1)
max <- max(mydata$x1)
2). Create a new data frame, new.x
Here comes the question:
How should I include x2 and x3 in the new.x?
new.x <- data.frame(
x1=seq(min, max, length=60),
x2= ???
x3= ???)
Then predict new.y with myglm:
new.y = predict(myglm, newdata=new.x, se.fit=TRUE)
Combine new.x and new.y:
addThese <- data.frame(new.x, new.y)
Add the confidence interval:
addThese <- mutate(addThese,
                   d15N = exp(fit),
                   lwr = exp(fit - 1.96 * se.fit),
                   upr = exp(fit + 1.96 * se.fit))
3). Visualization of the original data points and the glm prediction smooth line added:
ggplot(addThese, aes(x1, fit)) +
  geom_point(shape = 21, size = 3) +
  geom_smooth(data = addThese,
              aes(ymin = lwr, ymax = upr),
              stat = 'identity')
I'm still wondering whether this is the right way to create new.data, but I'll give it a try. So with your data, slightly modifying your code:
library(tidyverse)
myglm <- glm(y ~ x1 + x2 + x3 + x1:x2, family = gaussian, data = mydata)
minx <- min(mydata$x1)
maxx <- max(mydata$x1)
# create data with all combinations of x1, x2, x3
new.data <- expand.grid(x1 = seq(minx, maxx, length.out = 60),
                        x2 = unique(mydata$x2),
                        x3 = unique(mydata$x3))
# visualize data
data.frame(predict(myglm, newdata = new.data, se.fit = TRUE)[1:2]) %>%
  bind_cols(new.data) %>%
  mutate(d15N = exp(fit), lwr = fit - 1.96 * se.fit, upr = fit + 1.96 * se.fit) %>%
  ggplot(aes(x = x1, y = fit, colour = interaction(x2, x3))) +
  geom_point(size = 1, alpha = .75, pch = 19, position = "jitter") +
  geom_smooth(aes(ymin = lwr, ymax = upr), stat = "identity", alpha = .5) +
  facet_wrap(~ interaction(x2, x3, sep = " : "), nrow = 5) +
  ggthemes::theme_few() +
  labs(y = "Predicted value", x = bquote(x[1])) +
  theme(legend.position = "none")
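One caveat, and this is an assumption on my part rather than something the code above requires: expand.grid() produces every x2/x3 combination, including ones that never occur in mydata, where the model is extrapolating. If you only want predictions for observed pairs, a sketch:
# keep only the x2/x3 pairs that actually appear in the data,
# then cross them with the x1 sequence
new.data.obs <- mydata %>%
  distinct(x2, x3) %>%
  crossing(x1 = seq(minx, maxx, length.out = 60))
pred.obs <- predict(myglm, newdata = new.data.obs, se.fit = TRUE)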

Multiple Regression lines in ggplot2

Here is some test code, and I don't understand why it is not working as expected. It is a ggplot2 question, not an R one.
library(ggplot2)
K = 10
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
folds <- cut(seq(1, nrow(xy)), breaks = K, labels = FALSE)
p1 <- ggplot(xy, aes(x = xy$x, y = xy$yrand)) + geom_point() +
  ggtitle("Simple x vs y plot with added random noise") + xlab("X") + ylab("Y")
for (i in 1:K) {
  # Segment your data by fold using the which() function
  testIndexes <- which(folds == i, arr.ind = TRUE)
  testData <- xy[testIndexes, ]
  trainData <- xy[-testIndexes, ]
  lmTemp <- lm(yrand ~ x, data = trainData)
  p1 <- p1 + geom_line(data = trainData, aes(x = trainData$x, y = predict(lmTemp, newdata = trainData)))
}
p1
Now what I would like to see is a plot with 10 lines (the regression lines), but I only see one. Can someone help me out? Is it the ggplot2 syntax that is wrong?
Thanks, Umberto
EDITED:
I accepted the answer I got since it is a nice way of doing it. I just wanted to add a simple alternative that prepares a dataset for the graph I wanted to create. I think this method is slightly easier to understand if you don't have much R experience.
fitLines <- data.frame()  # collect one set of fitted values per fold
for (i in 1:K) {
  # Segment your data by fold using the which() function
  testIndexes <- which(folds == i, arr.ind = TRUE)
  testData <- xy[testIndexes, ]
  trainData <- xy[-testIndexes, ]
  lmTemp <- lm(yrand ~ x, data = trainData)
  # Let's build a data set for the lines
  fitLines <- rbind(fitLines,
                    data.frame(rep(paste("set", i), nrow(trainData)),
                               trainData[, 1],
                               predict(lmTemp, newdata = trainData)))
}
names(fitLines) <- c("set", "x","y")
p1 + geom_line(data = fitLines, aes(x = x, y = y, col = set))
And this is what you get
You could use the crossv_kfold() function from the modelr package and put your complete code into a pipe workflow. (Incidentally, your loop shows only one line because the expressions inside aes() are evaluated when the plot is printed, so every geom_line() layer ends up using lmTemp and trainData from the last iteration.)
library(modelr)
library(tidyverse)
x <- 1:100/100
y <- sapply (x, FUN= function(x) 1+x)
xy <- data.frame(x,y)
set.seed(1234)
xy$yrand <- xy$y + runif(100,min = -0.35, max = 0.5)
xy %>%
  crossv_kfold() %>%
  mutate(
    models = map(train, ~ lm(yrand ~ x, data = .x)),
    predictions = map2(models, test, ~ predict(.x, newdata = .y, type = "response"))
  ) %>%
  select(-train, -test, -models) %>%
  unnest() %>%
  bind_cols(xy) %>%
  ggplot(aes(x = x, y = predictions)) +
  stat_smooth(aes(colour = .id), method = "lm", se = FALSE) +
  geom_point(aes(y = yrand))
Putting the colour aesthetic inside the ggplot() call would also map the points to the groups:
xy %>%
  crossv_kfold() %>%
  mutate(
    models = map(train, ~ lm(yrand ~ x, data = .x)),
    predictions = map2(models, test, ~ predict(.x, newdata = .y, type = "response"))
  ) %>%
  select(-train, -test, -models) %>%
  unnest() %>%
  bind_cols(xy) %>%
  ggplot(aes(x = x, y = predictions, colour = .id)) +
  stat_smooth(method = "lm", se = FALSE) +
  geom_point(aes(y = yrand))

2 polynomial regressions in a ggplot() graph

This is my Dataset:
As you can see, there are two quantitative variables (X, Y) and 1 categorical variable (molar, with two factors: M1, M2).
I would like to represent in a single graph two polynomial regressions and their respective prediction intervals: one for the M1 level and one for the M2 level. Each polynomial regression has its own degree (M1 is a degree-4 polynomial regression, and M2 is degree-6).
I want to use the ggplot() function (from the ggplot2 package in R). I have actually produced this figure, but with all data merged (I mean, with no distinction between factors). This is the code I used:
# Fit a linear model
m <- lm(Y ~ X+I(X^2)+I(X^3)+I(X^4), data = Dataset)
# cbind the predictions to Dataset
mpi <- cbind(Dataset, predict(m, interval = "prediction"))
ggplot(mpi, aes(x = X)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr),
              fill = "blue", alpha = 0.2) +
  geom_point(aes(y = Y)) +
  geom_line(aes(y = fit), colour = "blue", size = 1)
With this result:
So, I would like to have two polynomial regressions of different degrees (one for M1 and one for M2), each with its respective prediction interval. What would be the exact code?
UPDATE - New code! I ran this code with no success:
M1=subset(Dataset,Dataset$molar=="M1",select=X:Y)
M2=subset(Dataset,Dataset$molar=="M2",select=X:Y)
M1.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M1",select=X:Y))
M2.R <- lm(Y ~ X +I(X^2)+I(X^3)+I(X^4),
data=subset(Dataset,Dataset$molar=="M2",select=X:Y))
newdf <- data.frame(x = seq(0, 1, c(408,663)))
M1.P <- cbind(data=subset(Dataset,Dataset$molar=="M1",select=X:Y), predict(M1.R, interval = "prediction"))
M2.P <- cbind(data=subset(Dataset,Dataset$molar=="M2",select=X:Y), predict(M2.R, interval = "prediction"))
p = cbind(as.data.frame(rbind(M1.P, M2.P)), f = factor(rep(1:2, c(408,663)), x = rep(newdf$x, 2))
mdf = with(Dataset, data.frame(x = rep(x, 2), y = c(subset(Dataset,Dataset$molar=="M1",select=Y), subset(Dataset,Dataset$molar=="M2",select=Y),
f = factor(rep(1:2, c(408,663))))
ggplot(mdf, aes(x = x, y = y, colour = f)) + geom_point() +
geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
fill = f, y = NULL, colour = NULL),
alpha = 0.2) +
geom_line(data = p, aes(x = x, y = fit))
These are the messages I get now:
[98] WARNING: Warning in if (n < 0L) stop("wrong sign in 'by' argument") :
the condition has length > 1 and only the first element will be used
Warning in if (n > .Machine$integer.max) stop("'by' argument is much too small") :
the condition has length > 1 and only the first element will be used
Warning in 0L:n :
numerical expression has 2 elements: only the first used
Warning in if (by > 0) pmin(x, to) else pmax(x, to) :
the condition has length > 1 and only the first element will be used
[99] WARNING: Warning in predict.lm(M1.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[100] WARNING: Warning in predict.lm(M2.R, interval = "prediction") :
predictions on current data refer to _future_ responses
[101] ERROR: <text>
I think I am closer but still can't see it. Help!
Here is one way. If you have more than two models/levels in the factor, you should look into code that loops over the levels of the factor and fits the models that way.
Anyway, first some dummy data:
set.seed(100)
x <- runif(100)
y1 <- 2 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) + rnorm(100)
y2 <- -1 + (0.3 * x) + (2.4 * x^2) + (-2.5 * x^3) + (3.4 * x^4) +
(-0.3 * x^5) + (2.4 * x^6) + rnorm(100)
df <- data.frame(x, y1, y2)
Fit our two models:
m1 <- lm(y1 ~ poly(x, 4), data = df)
m2 <- lm(y2 ~ poly(x, 6), data = df)
Now predict at some new locations x and stick the result together with x and f, a factor indexing the model, in a tidy format:
newdf <- data.frame(x = seq(0, 1, length = 100))
p1 <- predict(m1, newdata = newdf, interval = "prediction")
p2 <- predict(m2, newdata = newdf, interval = "prediction")
p <- cbind(as.data.frame(rbind(p1, p2)),
           f = factor(rep(1:2, each = 100)),
           x = rep(newdf$x, 2))
Melt the original data into tidy form
mdf <- with(df, data.frame(x = rep(x, 2), y = c(y1, y2),
                           f = factor(rep(1:2, each = 100))))
Draw the plot, using colour to distinguish the models/data
ggplot(mdf, aes(x = x, y = y, colour = f)) +
  geom_point() +
  geom_ribbon(data = p, aes(x = x, ymin = lwr, ymax = upr,
                            fill = f, y = NULL, colour = NULL),
              alpha = 0.2) +
  geom_line(data = p, aes(x = x, y = fit))
This gets us
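Applied to the question's own data, a sketch of the same idea might look like the following. I'm assuming Dataset has the columns X, Y and molar (levels "M1", "M2") and using the degree-4 and degree-6 fits the question describes:
# fit one polynomial per molar level
M1.R <- lm(Y ~ poly(X, 4), data = subset(Dataset, molar == "M1"))
M2.R <- lm(Y ~ poly(X, 6), data = subset(Dataset, molar == "M2"))
# predict both models over a common grid of X values
newX <- data.frame(X = seq(min(Dataset$X), max(Dataset$X), length.out = 200))
pred <- rbind(
  cbind(newX, predict(M1.R, newdata = newX, interval = "prediction"), molar = "M1"),
  cbind(newX, predict(M2.R, newdata = newX, interval = "prediction"), molar = "M2")
)
ggplot(Dataset, aes(X, Y, colour = molar)) +
  geom_point() +
  geom_ribbon(data = pred, aes(X, ymin = lwr, ymax = upr, fill = molar,
                               y = NULL, colour = NULL), alpha = 0.2) +
  geom_line(data = pred, aes(X, y = fit))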

How to graph my multiple linear regression model (caret)?

I have created a multiple linear regression model and would now like to plot it, but I can't seem to figure it out. Any help would be greatly appreciated! I used Boruta to find the feature attributes and then used train() to get the model. When I try to plot model_lm I get the error:
There are no tuning parameters with more than 1 value.
Here is the code of what I have attempted so far:
library(caret)
library(Boruta)
library(data.table)   # setDT()
library(tidyverse)
rt_train <- rttotal2
rt_train$year <- NULL
#rt_train$box_office <- NULL
# impute NAs and address multicollinearity
preproc <- preProcess(rt_train, method = c("knnImpute", "center", "scale"))
rt_proc <- predict(preproc, rt_train)
rt_proc$box_office <- rt_train$box_office
sum(is.na(rt_proc))
titles <- rt_proc$titles
rt_proc$titles <- NULL
#rt_train$interval <- as.factor(rt_train$interval)
dmy <- dummyVars(" ~ .", data = rt_proc,fullRank = T)
rt_transform <- data.frame(predict(dmy, newdata = rt_proc))
index <- createDataPartition(rt_transform$interval, p =.75, list = FALSE)
train_m <- rt_transform[index, ]
rt_test <- rt_transform[-index, ]
str(rt_train)
y_train <- train_m$box_office
y_test <-rt_test$box_office
train_m$box_office <- NULL
rt_test$box_office <- NULL
#selected feature attributes
boruta.train <- Boruta(interval~., train_m, doTrace =1)
#graph to see most important var to interval
lz<-lapply(1:ncol(boruta.train$ImpHistory),function(i)
boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]),i])
names(lz) <- colnames(boruta.train$ImpHistory)
plot(boruta.train, xlab = "", xaxt = "n")
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)
#get most important attributes
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
getSelectedAttributes(final.boruta, withTentative = F)
boruta.rt_df <- attStats(final.boruta)
boruta.rt_df
boruta.rt_df <- setDT(boruta.rt_df, keep.rownames = TRUE)[]
predictors <- boruta.rt_df %>%
filter(., decision =="Confirmed") %>%
select(., rn)
predictors <- unlist(predictors)
control <- trainControl(method="repeatedcv",
number=10,
repeats=6)
#look at residuals
#p-value is very small so reject H0 that predictors have no effect so
#we can use rotten tomatoes to predict box_office ranges
train_m$interval <- NULL
model_lm <- train(train_m[,predictors],
y_train, method='lm',
trControl = control, tuneLength = 10)
model_lm #.568
#
plot(model_lm)
z <- varImp(object=model_lm)
z <- setDT(z, keep.rownames = TRUE)
z$model <- NULL
z$calledFrom <- NULL
row.names(z)
plot(varImp(object=model_lm),main="Linear Model Variable Importance")
predictions<-predict.train(object=model_lm,rt_test[,predictors],type="raw")
table(predictions)
#get coeff
interc <- coef(model_lm$finalModel)
slope <- coef(model_lm$finalModel)
ggplot(data = rt_train, aes(y = box_office)) +
geom_point() +
geom_abline(slope = slope, intercept = interc, color = 'red')
This is what some of my input looks like. Thank you!!
Here is an example using the inbuilt data set cars:
data(cars, package = "datasets")
library(caret)
Build the model:
control <- trainControl(method = "repeatedcv",
                        number = 10,
                        repeats = 6)
model_lm <- train(dist ~ speed, data = cars, method = 'lm',
                  trControl = control, tuneLength = 10)
I will assume you would like to plot the final model.
You can use the caret predict.train function to get the predictions from the model and plot them:
pred <- predict(model_lm, cars)
pred <- data.frame(pred = pred, speed = cars$speed)
Additionally, you can provide the cars data set to geom_point() and plot the observations:
library(ggplot2)
ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist))
If you would like to obtain the confidence or prediction interval, you can use the predict.lm() function on model_lm$finalModel. Here is an example for the prediction interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "prediction")
pred <- data.frame(pred = pred$fit[, 1], speed = cars$speed,
                   lwr = pred$fit[, 2], upr = pred$fit[, 3])
pred_int <- ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
Or the confidence interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "confidence")
pred <- data.frame(pred = pred$fit[, 1], speed = cars$speed,
                   lwr = pred$fit[, 2], upr = pred$fit[, 3])
pred_conf <- ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
Plotting them side by side:
library(cowplot)
plot_grid(pred_int, pred_conf)
To plot the linear dependence on two predictors you can use a 3D plot; for more than three it becomes a problem.
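For the two-predictor case, a rough sketch with base persp() and the built-in mtcars data (purely an illustration, not your rt_train data):
data(mtcars)
fit2 <- lm(mpg ~ wt + hp, data = mtcars)
# evaluate the fitted plane on a grid of the two predictors
wt_seq <- seq(min(mtcars$wt), max(mtcars$wt), length.out = 30)
hp_seq <- seq(min(mtcars$hp), max(mtcars$hp), length.out = 30)
z <- outer(wt_seq, hp_seq,
           function(w, h) predict(fit2, data.frame(wt = w, hp = h)))
persp(wt_seq, hp_seq, z, theta = 30, phi = 20,
      xlab = "wt", ylab = "hp", zlab = "predicted mpg")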
