Related
I have a training df with 2 columns like
a b
1 1000 20
2 1008 13
...
n ... ...
Now, as I am required to find a 95% CI for the estimate of 'b' based on a specific 'a' value, with a 'k' value of my choice and compare the CI result to other specific value of 'k's. My question is how can I perform bootstrap for this with 1000 bootstrap reps as I am required to use a fitted knn model for the training data with kernel = 'gaussian' and k can only be in range 1-20 ?
I have found that the best k for this model is k = 5, and had a go for bootstrap but it doesn't work
library(kknn)
library(boot)
boot.kn = function(formula, data, indices)
{
# Create a bootstrapped version
d = data[indices,]
# Fit a model for bs
fit.kn = fitted(train.kknn(formula,data, kernel= "gaussian", ks = 5))
# Do I even need this complicated block
target = as.character(fit.kn$terms[[2]])
rv = my.pred.stats(fit.kn, d[,target])
return(rv)
}
bs = boot(data=df, statistic=boot.kn, R=1000, formula=b ~ a)
boot.ci(bs,conf=0.95,type="bca")
Please inform me for more info if I'm not clear enough. Thank you.
Here is a way to regress b on a with the k-nearest neighbors algorithm.
First, a data set. This is a subset of the iris data set, keeping the first two columns. One row is removed to later be the new data.
i <- which(iris$Sepal.Length == 5.3)
df1 <- iris[-i, 1:2]
newdata <- iris[i, 1:2]
names(df1) <- c("a", "b")
names(newdata) <- c("a", "b")
Now load the packages to be used and determine the optimal value for k with package kknn.
library(caret)
library(kknn)
library(boot)
fit <- kknn::train.kknn(
formula = b ~ a,
data = df1,
kmax = 15,
kernel = "gaussian",
distance = 1
)
k <- fit$best.parameters$k
k
#[1] 9
And bootstrap predictions for the new point a <- 5.3.
boot.kn <- function(data, indices, formula, newdata, k){
d <- data[indices, ]
fit <- knnreg(formula, data = d)
predict(fit, newdata = newdata)
}
set.seed(2021)
R <- 1e4
bs <- boot(df1, boot.kn, R = R, formula = b ~ a, newdata = newdata, k = k)
ci <- boot.ci(bs, level = 0.95, type = "bca")
ci
#BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#Based on 10000 bootstrap replicates
#
#CALL :
#boot.ci(boot.out = bs, type = "bca", level = 0.95)
#
#Intervals :
#Level BCa
#95% ( 3.177, 3.740 )
#Calculations and Intervals on Original Scale
Plot the results.
old_par <- par(mfrow = c(2, 1),
oma = c(5, 4, 0, 0) + 0.1,
mar = c(1, 1, 1, 1) + 0.1)
hist(bs$t, main = "Histogram of bootstrap values")
abline(v = 3.7, col = "red")
abline(v = mean(bs$t), col = "blue")
abline(v = ci$bca[4:5], col = "blue", lty = "dashed")
plot(b ~ a, df1)
points(5.3, 3.7, col = "red", pch = 19)
points(5.3, mean(bs$t), col = "blue", pch = 19)
arrows(x0 = 5.3, y0 = ci$bca[4],
x1 = 5.3, y1 = ci$bca[5],
col = "blue", angle = 90, code = 3)
par(old_par)
I tried a Naive Bayes classifier to see if I can predict if a person, given their age and estimated salary, would purchase a particular vehicle or not. The plot I got in the visualisation section looks not very smooth and clean, with white lines running across my plot. I'm assuiming the graphics/resolution is the problem but I am not sure.
This is a snippet of what the dataset looks like
Age EstimatedSalary Purchased
19 19000 0
35 20000 0
26 43000 0
27 57000 0
19 76000 0
27 58000 0
Here is the code
# Loading the data set
data <- read.csv(" *A csv sheet on people's age, salaries and whether or not they will purchase a certain vehicle* ")
data <- data[, 3:5]
attach(data)
# Encoding the dependent variable
data$Purchased <- factor(data$Purchased, levels = c(0, 1))
attach(data)
# Splitting the dataset
library(caTools)
set.seed(404)
split <- sample.split(Purchased, SplitRatio = 0.75)
train_set <- subset(data, split == T)
test_set <- subset(data, split == F)
# Feature scaling
train_set[-3] <- scale(train_set[-3])
test_set[-3] <- scale(test_set[-3])
# Training the model
library(e1071)
classifier <- naiveBayes(x = train_set[-3], y = train_set$Purchased)
# Predicting test results
y_pred <- predict(classifier, newdata = test_set[-3])
# Construct the confusion matrix
(cm <- table(test_set[, 3], y_pred))
Below is the code that I used to visualise the results
# Visualising the results
library(ElemStatLearn)
set <- test_set
x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(x1, x2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
y_grid <- predict(classifier, newdata = grid_set)
plot(set[, -3], main = "Naive Bayes: Test set", xlab = "Age", ylab = "EstimatedSalary", xlim = range(x1), ylim = range(x2))
contour(x1, x2, matrix(as.numeric(y_grid), length(x1), length(x2)), add = T)
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
Naive Bayes classifier plot on the test set predictions
Would like to know the reason for the white lines running up and down the plot and why it does not look smooth?
So I figured out what was giving me the weird lines and the low quality resolution. Adding the "cex = n" parameter to the "points()" function in the graph with n = 5 solved this.
Revised block of code
set <- test_set
x1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
x2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(x1, x2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
y_grid <- predict(classifier, newdata = grid_set)
plot(set[, -3], main = "Naive Bayes: Test set", xlab = "Age", ylab = "EstimatedSalary", xlim = range(x1), ylim = range(x2))
contour(x1, x2, matrix(as.numeric(y_grid), length(x1), length(x2)), add = T)
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"), cex = 5)
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
The revised line of code in the above block
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "Springgreen3", "tomato"), cex = 5)
However the case, I would still like to know the reason behind how this happened because the explanation available in R about the functions and the parameters were not that clear to me.
Would appreciate any help given!
This question already exists:
R: Convert "grob" (graphical object) to "ggplot" [duplicate]
Closed 2 years ago.
I working with the R programming language. I am trying to convert a "grob" object into a "ggplot" object (the goal is eventually to convert the ggplot object into a "plotly" object).
I am looking for "the most simple" way to convert "grob" to "ggplot" - the computer I am using does not have a USB port or an internet connection, it only has R with some preloaded libraries (e.g. ggplot2, ggpubr)
In my example: I generated some data, ran a statistical model ("random forest") and plotted the results using "compressed" axis ("Tsne"). The code below can be copy/pasted into R, and the resulting "plot" ("final_plot") is the object that I want to convert to "ggplot":
library(cluster)
library(Rtsne)
library(dplyr)
library(randomForest)
library(caret)
library(ggplot2)
library(plotly)
#PART 1 : Create Data
#generate 4 random variables : response_variable ~ var_1 , var_2, var_3
var_1 <- rnorm(10000,1,4)
var_2<-rnorm(10000,10,5)
var_3 <- sample( LETTERS[1:4], 10000, replace=TRUE, prob=c(0.1, 0.2, 0.65, 0.05) )
response_variable <- sample( LETTERS[1:2], 10000, replace=TRUE, prob=c(0.4, 0.6) )
#put them into a data frame called "f"
f <- data.frame(var_1, var_2, var_3, response_variable)
#declare var_3 and response_variable as factors
f$response_variable = as.factor(f$response_variable)
f$var_3 = as.factor(f$var_3)
#create id
f$ID <- seq_along(f[,1])
#PART 2: random forest
#split data into train set and test set
index = createDataPartition(f$response_variable, p=0.7, list = FALSE)
train = f[index,]
test = f[-index,]
#create random forest statistical model
rf = randomForest(response_variable ~ var_1 + var_2 + var_3, data=train, ntree=20, mtry=2)
#have the model predict the test set
pred = predict(rf, test, type = "prob")
labels = as.factor(ifelse(pred[,2]>0.5, "A", "B"))
confusionMatrix(labels, test$response_variable)
#PART 3: Visualize in 2D (source: https://dpmartin42.github.io/posts/r/cluster-mixed-types)
gower_dist <- daisy(test[, -c(4,5)],
metric = "gower")
gower_mat <- as.matrix(gower_dist)
labels = data.frame(labels)
labels$ID = test$ID
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(labels$labels),
name = labels$ID)
plot = ggplot(aes(x = X, y = Y), data = tsne_data) +
geom_point(aes(color = labels$labels))
plotly_plot = ggplotly(plot)
a = tsne_obj$Y
a = data.frame(a)
data = a
data$class = labels$labels
decisionplot <- function(model, data, class = NULL, predict_type = "class",
resolution = 100, showgrid = TRUE, ...) {
if(!is.null(class)) cl <- data[,class] else cl <- 1
data <- data[,1:2]
k <- length(unique(cl))
plot(data, col = as.integer(cl)+1L, pch = as.integer(cl)+1L, ...)
# make grid
r <- sapply(data, range, na.rm = TRUE)
xs <- seq(r[1,1], r[2,1], length.out = resolution)
ys <- seq(r[1,2], r[2,2], length.out = resolution)
g <- cbind(rep(xs, each=resolution), rep(ys, time = resolution))
colnames(g) <- colnames(r)
g <- as.data.frame(g)
### guess how to get class labels from predict
### (unfortunately not very consistent between models)
p <- predict(model, g, type = predict_type)
if(is.list(p)) p <- p$class
p <- as.factor(p)
if(showgrid) points(g, col = as.integer(p)+1L, pch = ".")
z <- matrix(as.integer(p), nrow = resolution, byrow = TRUE)
contour(xs, ys, z, add = TRUE, drawlabels = FALSE,
lwd = 2, levels = (1:(k-1))+.5)
invisible(z)
}
model <- randomForest(class ~ ., data=data, mtry=2, ntrees=500)
#this is the final plot
final_plot = decisionplot(model, data, class = "class", main = "rf (1)")
From here, I am trying to convert this object ("final_plot") into a ggplot object:
library(ggpubr)
final = ggpubr::as_ggplot(final_plot)
But this gives me the following error:
Error in gList(...) : only 'grobs' allowed in "gList"
From here, I eventually would have wanted to use this command to convert the ggplot into a plotly object:
plotly_plot = ggplotly(final)
Does anyone know if there is a straightforward way to convert "final_plot" into a ggplot object? (and then plotly)? I don't have the ggplotify library.
Thanks
I am not practice in R and I need some help. I am an ecologist, I have a sites by variables matrix, where "TD0", "TD1", "TD2" are the response variables and "Chao", "age", "slope" the explanatory variables.
ID.plot TD0 TD1 TD2 Chao age slope
1 GS_Ci01N 20 8.898 6.488 0.6521390 26 2
2 GS_Ci03N 26 7.788 4.883 0.2335441 26 2
3 GS_Ci04N 31 10.482 7.282 0.5234748 26 0
4 GS_Ci05N 47 18.108 11.989 0.3110385 26 3
5 GS_Ci06N 47 16.332 10.107 0.4529010 26 0
6 GS_Ci07N 31 9.478 5.725 0.5524426 26 1
db.chao <- read.table(text=db.chao, header = TRUE)
I built boosted regression trees (BRT) to define the thresholds from explanatory-response shapes. I used "dismo" and "gbm" packages.
mod0 <- gbm.step(data=db.chao, gbm.x = 5:7, gbm.y = 2, family = "poisson", tree.complexity = 5, learning.rate = 0.0025, bag.fraction = 0.5)
mod1 <- gbm.step(data=db.chao, gbm.x = 5:7, gbm.y = 3, family = "gaussian", tree.complexity = 5, learning.rate = 0.0025, bag.fraction = 0.5)
mod2 <- gbm.step(data=db.chao, gbm.x = 5:7, gbm.y = 4, family = "gaussian", tree.complexity = 5, learning.rate = 0.0025, bag.fraction = 0.5)
I obtained three models:
"mod0" which describes the relation between TD0 and the explanatory variables
"mod1" which describes the relation between TD1 and the explanatory variables
"mod2" which describes the relation between TD2 and the explanatory variables
For each of them I built the panel graphs as below (these are just examples):
enter image description here
enter image description here
enter image description here
For each response variable I have three graphs, one for each explanatory variable.
I obtained them with this script:
gbm.plot(mod.TD0, n.plots = 3, write.title= FALSE, main = "TD0", rug = T, smooth = TRUE, plot.layout=c(1,3), common.scale = T)
gbm.plot(mod.TD1, n.plots = 3, write.title= FALSE, main = "TD1", rug = T, smooth = TRUE, plot.layout=c(1,3), common.scale = T)
gbm.plot(mod.TD2, n.plots = 3, write.title= FALSE, main = "TD2", rug = T, smooth = TRUE, plot.layout=c(1,3), common.scale = T)
Actually I would like to have three graphs, one for each explanatory variable and, after that if possible, in each of them I would like to overlap the three response variables shapes (with three different lines or colors).
I suppose I should use the "pdp" package for constructing partial dependence plots, but I'm not able to do this.
If someone could help me I would be grateful.
Thank you very much!
I'm not really sure how a gbm works and why it needs the number of trees to predict the output but here is a working example using the pdp and gridExtra packages:
library(pdp)
ntrees <- 250 # Number of trees to use to predict data
pred <- function(object, newdata) {
pred <- predict(object, newdata, n.trees = ntrees)
mean(pred)
}
pdps1 <- pdps2 <- pdps3 <- list()
for (i in 1:3) {
pdps1[[i]] <- partial(mod0, pred.var = names(db.chao)[i+4],
train = db.chao, plot = TRUE,
pred.fun = pred, recursive = F)
pdps2[[i]] <- partial(mod1, pred.var = names(db.chao)[i+4],
train = db.chao, plot = TRUE,
pred.fun = pred, recursive = F)
pdps3[[i]] <- partial(mod2, pred.var = names(db.chao)[i+4],
train = db.chao, plot = TRUE,
pred.fun = pred, recursive = F)
}
gridExtra::grid.arrange(grobs = pdps1, nrow = 1) # For the first model
gridExtra::grid.arrange(grobs = pdps2, nrow = 1) # For the second model
gridExtra::grid.arrange(grobs = pdps3, nrow = 1) # For the third model
Hope this helps!
EDIT
Following with the request of OP to obtain all pdps in only three plots and with different number of trees to predict the values:
library(pdp)
ntrees1 <- 150 # Number of trees to use to predict data with model1
ntrees2 <- 250 # Number of trees to use to predict data with model2
ntrees3 <- 50 # Number of trees to use to predict data with model3
pred1 <- function(object, newdata) {
pred <- predict(object, newdata, n.trees = ntrees1)
mean(pred)
}
pred2 <- function(object, newdata) {
pred <- predict(object, newdata, n.trees = ntrees2)
mean(pred)
}
pred3 <- function(object, newdata) {
pred <- predict(object, newdata, n.trees = ntrees3)
mean(pred)
}
# Function to obtain legend to plot later in grid.arrange
get_legend<-function(myggplot){
tmp <- ggplot_gtable(ggplot_build(myggplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)
}
# Obtain partial dependence data instead of plot
pdps1 <- pdps2 <- pdps3 <- list()
plotlist <- list()
for (i in 1:3) {
# Create local environment to prevent ggplot to overwrite the plots with the iterator
local({
i <- i
pdps1[[i]] <<- partial(mod0, pred.var = names(db.chao)[i+4],
train = db.chao, plot = FALSE,
pred.fun = pred1, recursive = F)
pdps2[[i]] <<- partial(mod1, pred.var = names(db.chao)[i+4],
train = db.chao, plot = FALSE,
pred.fun = pred2, recursive = F)
pdps3[[i]] <<- partial(mod2, pred.var = names(db.chao)[i+4],
train = db.chao, plot = FALSE,
pred.fun = pred3, recursive = F)
pdp <- rbind(pdps1[[i]],pdps2[[i]],pdps3[[i]])
pdp <- cbind(pdp,rep(c("y1","y2","y3"), each = nrow(pdps1[[i]])))
names(pdp)[3] <- "#output"
plotlist[[i]] <<- ggplot(pdp) +
geom_line(aes(x = pdp[,1], y = pdp[,2],
group = pdp[,3], color = pdp[,3])) +
xlab(names(pdp)[1]) + ylab("yhat") +
ggtitle(paste0("PDP of ",names(pdp)[1])) +
labs(color = "#output")
})
legend <- get_legend(plotlist[[i]])
plotlist[[i]] <- plotlist[[i]] + theme(legend.position = "none")
}
plotlist[[4]] <- legend
gridExtra::grid.arrange(grobs = plotlist, nrow = 1, widths=c(2.3, 2.3, 2.3, 0.8))
I am trying to use a Gaussian Process Regression (GPR) model to predict hourly streamflow discharges in a river. I've got good results applying the caret::kernlab train () function (thanks Kuhn!).
Since the uncertainty idea is one of the main inherent ones advantages of the GPR, I would like to know if anyone could help me to access the results related to the prediction inteval of the test dataset.
I'll put an extract of the code I've been working. Since my real data are huge (and sincerely, I don't know how to put it here), I'll example with the data(airquality). The main goal in this particular example is to predict airquality$Ozone, using as predictos the lag-variables of airquality$Temperature.
rm(list = ls())
data(airquality)
airquality = na.omit(as.data.frame(airquality)); str(airquality)
library(tidyverse)
library(magrittr)
airquality$Ozone %>% plot(type = 'l')
lines(airquality$Temp, col = 2)
legend("topleft", legend = c("Ozone", "Temperature"),
col=c(1, 2), lty = 1:1, cex = 0.7, text.font = 4, inset = 0.01,
box.lty=0, lwd = 1)
attach(airquality)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train = data.frame(df_lags[1:81, ]) # Training Observed 75% dataset
ESM_test = data.frame(df_lags[82:nrow(df_lags), ]) # Testing Observed 25% dataset
grid_gaussprRadial = expand.grid(.sigma = c(0.001, 0.01, 0.05, 0.1, 0.5, 1, 2)) # Sigma parameters searching for GPR
# TRAIN MODEL ############################
# Tuning set
library(caret)
set.seed(111)
cvCtrl <- trainControl(
method ="repeatedcv",
repeats = 1,
number = 20,
allowParallel = TRUE,
verboseIter = TRUE,
savePredictions = "final")
# Train (aprox. 4 seconds time-simulation)
attach(ESM_train)
set.seed(111)
system.time(Model_train <- caret::train(Ozone ~ Temp + Temp_lag1,
trControl = cvCtrl,
data = ESM_train,
metric = "MAE", # Using MAE since I intend minimum values are my focus
preProcess = c("center", "scale"),
method = "gaussprRadial", # Setting RBF kernel function
tuneGrid = grid_gaussprRadial,
maxit = 1000,
linout = 1)) # Regression type
plot(Model_train)
Model_train
ESM_results_train <- Model_train$resample %>% mutate(Model = "") # K-fold Training measures
# Select the interested TRAIN data and arrange them as dataframe
Ozone_Obs_Tr = Model_train$pred$obs
Ozone_sim = Model_train$pred$pred
Resid = Ozone_Obs_Tr - Ozone_sim
train_results = data.frame(Ozone_Obs_Tr,
Ozone_sim,
Resid)
# Plot Obs x Simulated train results
library(ggplot2)
ggplot(data = train_results, aes(x = Ozone_Obs_Tr, y = Ozone_sim)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "black")
# TEST MODEL ############################
# From "ESM_test" dataframe, we predict ESM Ozone time series, adding it in "ESM_forecasted" dataframe
ESM_forecasted = ESM_test %>%
mutate(Ozone_Pred = predict(Model_train, newdata = ESM_test, variance.model = TRUE))
str(ESM_forecasted)
# Select the interested TEST data and arrange them as a dataframe
Ozone_Obs = ESM_forecasted$Ozone
Ozone_Pred = ESM_forecasted$Ozone_Pred
# Plot Obs x Predicted TEST results
ggplot(data = ESM_forecasted, aes(x = Ozone_Obs, y = Ozone_Pred)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "black")
# Model performance #####
library(hydroGOF)
gof_TR = gof(Ozone_sim, Ozone_Obs_Tr)
gof_TEST = gof(Ozone_Pred,Ozone_Obs)
Performances = data.frame(
Train = gof_TR,
Test = gof_TEST
); Performances
# Plot the TEST prediction
attach(ESM_forecasted)
plot(Ozone_Obs, type = "l", xlab = "", ylab = "", ylim = range(Ozone_Obs, Ozone_Pred))
lines(Ozone_Pred , col = "coral2", lty = 2, lwd = 2)
legend("top", legend = c("Ozone Obs Test", "Ozone Pred Test"),
col=c(1, "coral2"), lty = 1:2, cex = 0.7, text.font = 4, inset = 0.01, box.lty=0, lwd = 2)
These last lines generate the following plot:
The next, and last, step would be to extract the prediction intervals, which is based on a gaussian distribution around each prediction point, to plot it together with this last plot.
The caret::kernlab train() appliance returned better prediction than, for instance, just kernlab::gaussprRadial(), or even tgp::bgp() packages. For both of them I could find the prediction interval.
For example, to pick up the prediction intervals via tgp::bgp(), it could be done typing:
Upper_Bound <- Ozone_Pred$ZZ.q2 #Ozone_Pred - 2 * sigma^2
Lower_Bound <- Ozone_Pred$ZZ.q1 #Ozone_Pred + 2 * sigma^2
Therefore, via caret::kernlab train(), I hope the required standard deviations could be found typing something as
Model_train$...
or maybe, with
Ozone_Pred$...
Moreover, at link: https://stats.stackexchange.com/questions/414079/can-mad-median-absolute-deviation-or-mae-mean-absolute-error-be-used-to-calc,
Stephan Kolassa author explained that we could estimate the prediction intervals through MAE, or even RMSE. But I didn't understand if this is my point, since the MAE I got is just the comparison between Obs x Predicted Ozone data, in this example.
Please, this solution is very important to me! I think I am near to obtain my main results, but I don't know anymore how to try.
Thanks a lot, friends!
I don't really know how the caret framework works, but getting a prediction interval for a GP regression with a Gaussian likelihood is easy enough to do manually.
First we just need a function for the squared exponential kernel, also called the radial basis function kernel, which is what you were using. sf here is the scale factor (unused in the kernlab implementation), and ell is the length scale, called sigma in the kernlab implementation:
covSEiso <- function(x1, x2 = x1, sf = 1.0, ell = 1.0) {
sf <- sf^2
ell <- -0.5 * (1 / (ell^2))
n <- nrow(x1)
m <- nrow(x2)
d <- ncol(x1)
result <- matrix(0, nrow = n, ncol = m)
for ( j in 1:m ) {
for ( i in 1:n ) {
result[i, j] <- sf * exp(ell * sum((x1[i, ] - x2[j, ])^2))
}
}
return(result)
}
I'm not sure what your code says about which length scale to use; below I will use a length scale of 25 and scale factor of 50 (obtained via GPML's hyperparameter optimization routines). Then we use the covSEiso() function above to get the relevant covariances, and the rest is application of basic Gaussian identities. I would refer you to Chapter 2 of Rasmussen and Williams (2006) (graciously provided for free online).
data(airquality)
library(tidyverse)
library(magrittr)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train <- data.frame(df_lags[1:81, ]) # Training Data 75% dataset
ESM_test <- data.frame(df_lags[82:nrow(df_lags), ]) # Testing Data 25% dataset
## For convenience I'll define separately the training and test inputs
X <- ESM_train[ , c("Temp", "Temp_lag1")]
Xstar <- ESM_test[ , c("Temp", "Temp_lag1")]
## Get the kernel manually
K <- covSEiso(X, ell = 25, sf = 50)
## We also need covariance between the test cases
Kstar <- covSEiso(Xstar, X, ell = 25, sf = 50)
Ktest <- covSEiso(Xstar, ell = 25, sf = 50)
## Now the 95% credible region for the posterior is
predictive_mean <- Kstar %*% solve(K + diag(nrow(K))) %*% ESM_train$Ozone
predictive_var <- Ktest - (Kstar %*% solve(K + diag(nrow(K))) %*% t(Kstar))
## Then for the prediction interval we only need to add the observation noise
z <- sqrt(diag(predictive_var)) + 25
interval_high <- predictive_mean + 2 * z
interval_low <- predictive_mean - 2 * z
Then we can check out the prediction intervals
This all is pretty easy to do via my gplmr package (available on GitHub) which can call GPML from R if you have Octave installed:
data(airquality)
library(tidyverse)
library(magrittr)
library(gpmlr)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train <- data.frame(df_lags[1:81, ]) # Training Data 75% dataset
ESM_test <- data.frame(df_lags[82:nrow(df_lags), ]) # Testing Data 25% dataset
X <- as.matrix(ESM_train[ , c("Temp", "Temp_lag1")])
y <- ESM_train$Ozone
Xs <- as.matrix(ESM_test[ , c("Temp", "Temp_lag1")])
ys <- ESM_test$Ozone
hyp0 <- list(mean = numeric(), cov = c(0, 0), lik = 0)
hyp <- set_hyperparameters(hyp0, "infExact", "meanZero", "covSEiso","likGauss",
X, y)
gp_res <- gp(hyp, "infExact", "meanZero", "covSEiso", "likGauss", X, y, Xs, ys)
predictive_mean <- gp_res$YMU
interval_high <- gp_res$YMU + 2 * sqrt(gp_res$YS2)
interval_low <- gp_res$YMU - 2 * sqrt(gp_res$YS2)
Then just plot the predictions, as above:
plot(NULL, xlab = "", ylab = "", xaxt = "n", yaxt = "n",
xlim = range(ESM_test$Temp), ylim = range(c(interval_high, interval_low)))
axis(1, tick = FALSE, line = -0.75)
axis(2, tick = FALSE, line = -0.75)
mtext("Temp", 1, 1.5)
mtext("Ozone", 2, 1.5)
idx <- order(ESM_test$Temp)
polygon(c(ESM_test$Temp[idx], rev(ESM_test$Temp[idx])),
c(interval_high[idx], rev(interval_low[idx])),
border = NA, col = "#80808080")
lines(ESM_test$Temp[idx], predictive_mean[idx])
points(ESM_test$Temp, ESM_test$Ozone, pch = 19)
plot(NULL, xlab = "", ylab = "", xaxt = "n", yaxt = "n",
xlim = range(ESM_test$Temp_lag1), ylim = range(c(interval_high, interval_low)))
axis(1, tick = FALSE, line = -0.75)
axis(2, tick = FALSE, line = -0.75)
mtext("Temp_lag1", 1, 1.5)
mtext("Ozone", 2, 1.5)
idx <- order(ESM_test$Temp_lag1)
polygon(c(ESM_test$Temp_lag1[idx], rev(ESM_test$Temp_lag1[idx])),
c(interval_high[idx], rev(interval_low[idx])),
border = NA, col = "#80808080")
lines(ESM_test$Temp_lag1[idx], predictive_mean[idx])
points(ESM_test$Temp_lag1, ESM_test$Ozone, pch = 19)