R - Narrow prediction intervals when forecasting with nnetar - r

I am trying to build a forecast using the nnetar function from the forecast package.
I get some fairly good forecast compared to other methods, but I am having trouble with it producing very narrow prediction intervals.
The dataset I am trying to forecast is weekly revenue data from an e-commerce with conversion rate and adspend as explanatory x variables (Xreg)
This is how my forecast looks like:
This is the code I have used to produce it:
fit_test <- nnetar(total_revenue_ts, size = 5, repeats = 200, xreg = xreg)
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, h=26)
autoplot(fit_test_fc) + autolayer(test_rev_ts$total)
This is the data I have used:
total_revenue_ts <- structure(c(429527.84912, 5107534.789265, 5334742.992202, 7062236.076739,
7376937.2329, 8843885.679834, 10312889.001099, 4743025.186331,
1063820.467744, 8647610.570788, 7615849.910587, 6950888.592508,
6858879.08066, 7207686.138817, 6552543.847104, 6286320.862515,
6387758.212579, 6267651.456223, 6166523.577491, 6517987.757523,
4032163.322867, 6774882.672302, 7280882.606489, 7042888.802793,
5864325.907872, 7614073.472534, 5702820.168872, 5993043.498666,
5748712.530684, 5781854.779949, 6514731.488613, 6200435.741256,
6716691.630149, 5671091.670532, 6849896.078633, 6412725.445233,
5820498.437424, 5140661.371894, 5543105.774292, 6498649.993838,
6832579.992745, 6363471.54561, 5764986.861829, 6479827.767348,
6082916.613222, 5654806.062709, 6250723.443025, 7021696.610899,
6878521.38167, 6605964.840134, 5860880.924163, 6027383.028358,
7271275.876805, 5788375.978398, 5952319.104294, 8700792.56985,
9387497.556219, 10628335.699833, 12300448.541447, 7624816.545391,
8041602.838183, 7340912.745611, 6475830.912185, 6511598.406008,
7670675.084654, 6597851.103698, 5992838.357045, 5782002.308393,
7591927.838791, 6316308.891923, 6024260.46223, 6099526.226113,
5341138.559686, 5959177.962052, 4614361.675905, 5649334.049846,
6774789.19439, 7823320.381864, 5941416.816392, 6576822.658397,
4949544.168466, 6394315.633561, 5432101.434962, 5971872.77196,
6375234.021085, 6776885.612781, 6381300.2023, 5376238.120971,
4654630.262986, 5404870.534346, 6616177.722868, 6627152.023493,
6566693.385556, 6687236.645467, 6473086.938295, 5478904.979073,
5884130.390298, 6219789.15664), .Tsp = c(2015.84615384615, 2017.71153846154,
52), class = "ts")
xreg <- structure(c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 5723.69, 5528.78, 6099.31, 13001.39, 6750.07,
6202.91, 6685.01, 5020, 5094.73, 2714.07, 9645.9, 8208.18, 6297.5,
8053.29, 0, 4418.27, 9393.52, 11139.19, 12678.08, 12493.18, 11242.28,
9617.09, 6959.37, 11716.52, 8464.61, 1499.14, 14538.86, 12080.69,
11905.71, 14405.72, 9077.05, 10362.49, 13776.75, 17620.9, 14767.2,
19511.98, 19747.72, 19885.44, 16810.46, 10618.04, 7494.02, 8166.45,
7503.29, 7955.54, 7971.87, 14520.84, 19219.74, 18824.67, 27216.48,
32030.82, 32007.76, 24153.88, 20472.33, 17617.01, 4.77806579193501,
5.7287751461365, 5.28098389402001, 5.02434789173682, 4.95184840012426,
5.64277441770686, 5.37984870432963, 5.3906432267007, 5.43849275673932,
5.6884135855546, 5.2709838799333, 5.41075942817439, 4.94887009216008,
4.95521307717827, 5.62734185585188, 5.51042637235732, 5.29807054480431,
5.52756845275268, 5.70969961018115, 5.54781801299907, 5.73014260988972,
5.99759204482959, 6.22750289793179, 5.93356463634027, 5.69127817622951,
5.57154841999638, 5.66114857960352, 5.72923212265929, 5.31293510374571,
5.35736716492903, 5.65568332596196, 5.74619318262752, 5.5954764989987,
5.34701430785202, 5.38617886178862, 6.0341348094332, 5.46323395671082,
5.33899929707969, 5.22135801253651, 5.65190410869423, 5.28112320474013,
4.80649483723496, 4.81842452314323, 5.00675102835432, 4.49345845605863,
3.82212461085761, 4.62551440329218, 3.79930173346953, 5.71101883613167,
6.40135958079592, 7.1027311558873, 4.0456548762572, 4.86275624624909,
3.68451118002285, 5.40269725877529, 5.24419134903069, 5.0344951706761,
4.89131058216232, 5.63214154072982, 5.52286515754452, 4.99781361730586,
5.09012974090091, 5.43346256247373, 5.20251523559131, 5.25889558131295,
4.17869474160865, 5.59036205822923, 5.33376848927069, 5.38868363783592,
5.43341024859593, 5.19857108205253, 5.19137882047327, 5.23814895237021,
5.01957530659338, 5.48137535816619, 5.67523044227311, 5.26029025707068,
5.18449109254837, 5.24915583751151, 5.45151430953043, 5.34584086799277,
4.97336938233212, 5.22618004090631, 5.52619366814479, 5.70389182510811,
5.75578084064244, 5.53339664450776, 5.16303263313334, 5.88409835642594,
5.56461936196381, 5.20891730381574, 5.21675833063733, 5.30279468609766,
5.22628072593614, 4.77056025260184, 4.72482746416563, 4.68623694730198,
5.07214098963881), .Dim = c(98L, 2L), .Dimnames = list(NULL,
c("adCost", "transactionsPerSession")), .Tsp = c(2015.84615384615,
2017.71153846154, 52), class = c("mts", "ts", "matrix"))
xreg_test <- structure(c(17617.01, 13526.88, 14836.89, 20358.16, 20416.79,
21635.72, 15456.3, 12569.27, 18673, 20591.58, 18922.52, 19658.27,
21371.37, 20921.06, 18846.68, 17315.48, 18569.47, 20276.32, 17932.33,
18405.48, 17566.76, 15605.29, 18694.58, 17082.73, 18291.26, 18211.78,
18252.98, 5.07214098963881, 4.9644513137558, 4.50735617759714,
3.42940249666707, 5.57244242550868, 6.85297018333131, 8.27499041424656,
5.64773791252811, 4.17746355274814, 4.78132627344352, 4.5212649754887,
4.16629173040583, 3.95132622368061, 4.2603550295858, 4.07247936849659,
3.98828918165935, 3.8364837584878, 4.32967453511229, 4.10479719434903,
3.88986772076209, 3.89750505731625, 4.02224223511425, 4.23119830350054,
3.54885240337703, 4.05530730967035, 4.46043036568541, 4.59654125314768
), .Dim = c(27L, 2L), .Dimnames = list(NULL, c("adCost", "transactionsPerSession"
)), .Tsp = c(2017.71153846154, 2018.21153846154, 52), class = c("mts",
"ts", "matrix"))
test_rev_ts$total <- structure(c(6219789.15664, 6207675.91913, 5375609.354946, 5970907.816396,
4905889.954914, 6003436.003269, 6311734.743992, 5771009.21678,
5284469.645259, 7228321.956032, 7070364.421462, 8978263.238038,
11173150.908703, 8212310.181272, 5336736.750351, 6918492.690826,
7807812.156676, 7025220.106499, 6539795.925754, 6734049.267568,
6736165.004623, 5775402.314813, 6083716.578991, 6441420.211984,
6269669.541568, 4968476.314634, 11122809.394872), .Tsp = c(2017.71153846154,
2018.21153846154, 52), class = "ts")
I would really appreciate if anyone could explain why I am getting so narrow prediction intervals and how to solve it.

Why are the prediction intervals so narrow?
By default, nnetar uses information from the in-sample residuals for the innovations used in prediction intervals. The residuals can be arbitrarily small depending on the complexity of your model. The documentation gives this warning:
Note that if the network is too complex and overfits the data, the
residuals can be arbitrarily small; if used for prediction interval
calculations, they could lead to misleadingly small values.
Related to that, your time series has 98 points and the model has 31 parameters. Furthermore, the data has a seasonal period of 52, and when using a seasonal lag you then effectively only have 46 data points to fit.
As a reference, the standard deviation of the nnetar residuals is roughly 4 times smaller than the residuals from auto.arima.
What to do about narrow prediction intervals?
There are a couple possibilities. To speed up the computation of these examples I decreased the number of models fitted (to repeats = 50) and the number of PI simulations (to npaths = 50) from your example. To factor out effects from these changes and RNG, consider the model below as the baseline:
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg)
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Provide better innovations for forecast to use
These will affect the intervals, but the mean forecast will remain the same.
Set innovations manually
If you have some external knowledge of more appropriate innovations
to use, you can provide them through the innov argument when
forecasting.
For example, say that you happen to know that the standard deviation
of the innovations should really 3 times larger than what the
residuals show. Then you can do:
set.seed(1234) fit_test <- nnetar(total_revenue_ts, size = 5, repeats
= 50, xreg = xreg)
## Set up new innovations for PI
res_sd <- sd(residuals(fit_test), na.rm=T)
myinnovs <- rnorm(nrow(xreg_test)*50, mean=0, sd=res_sd*3)
## fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50, innov = myinnovs)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Use out-of-sample values
You could estimate better innovations by using out-of-sample
residuals rather than the in-sample ones. The subset argument in
nnetar allows you to fit only part of the data. You could also use
the CVar function for cross-validation and grab the residuals from
there. This is an example using the latter:
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg)
## Set up new innovations for PI
fit_test_cv < CVar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg)
res_sd <- sd(fit_test_cv$residuals, na.rm=T)
myinnovs <- rnorm(nrow(xreg_test)*50, mean=0, sd=res_sd)
##
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = reg_test, npaths = 50, innov = myinnovs)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Control for overfitting
These modifications will affect your model in addition to the prediction intervals, so the mean prediction will change compared to the baseline.
Drop the seasonal lag
Your data has roughly 2 seasonal periods. Using a seasonal lag makes you lose a large fraction of it since the model needs the lagged values for fitting and forecasting. You could remove the seasonal component, perhaps adding extra lags to compensate. In the example below, by having more lags I'm increasing the number of parameters to 36, but "gaining" 49 points due to not having the seasonal lag.
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, p=3, P=0, size = 5, repeats = 50, xreg = xreg)
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Decrease the model complexity
As mentioned before, with 5 neurons you have 31 parameters. Dropping that number to, say, size=2 reduces the number of parameters to 13.
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, size = 2, repeats = 50, xreg = xreg)
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Use regularization
To compensate for model complexity, we can use the decay argument from nnet for regularization.
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg, decay = 1)
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
Bottom line
Several of these options can also be combined if appropriate for your uses, but at the end of the day it's important to keep in mind these are complex models and there's there's only so much you can do with ~100 data points.
Here's what a regularized model combined with out-of-sample residuals would look like:
set.seed(1234)
fit_test <- nnetar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg, decay = 0.1)
## Set up new innovations for PI
fit_test_cv <- CVar(total_revenue_ts, size = 5, repeats = 50, xreg = xreg, decay = 0.1)
res_sd <- sd(fit_test_cv$residuals, na.rm=T)
myinnovs <- rnorm(nrow(xreg_test)*50, mean=0, sd=res_sd)
##
fit_test_fc <- forecast(fit_test, PI=TRUE , xreg = xreg_test, npaths = 50, innov = myinnovs)
autoplot(fit_test_fc) + autolayer(test_rev_ts)
**Note that in the examples I assumed a normal distributions for the innovations and only varied the standard deviation, but they could just as well follow any other arbitrary distribution when manually adding them through the innov argument.

Related

Am I using xgboost() correctly (in R)?

I'm a beginner with machine learning (and also R). I've figured out how to run some basic linear regression, elastic net, and random forest models in R and have gotten some decent results for a regression project (with a continuous dependent variable) that I'm working on.
I've been trying to learning how to use the gradient boosting algorithm and, in particular, the xgboost() command. My results are way worse here, though, and I'm not sure why.
I was hoping someone could take a look at my code and see if there are any glaring errors.
# Create training data with and without the dependent variable
train <- data[1:split, ]
train.treat <- select(train, -c(y))
# Create test data with and without the dependent variable
test <- data[(split+1):nrow(data), ]
test.treat <- select(test, -c(y))
# Load the package xgboost
library(xgboost)
# Run xgb.cv
cv <- xgb.cv(data = as.matrix(train.treat),
label = train$y,
nrounds = 100,
nfold = 10,
objective = "reg:linear",
eta = 0.1,
max_depth = 6,
early_stopping_rounds = 10,
verbose = 0 # silent
)
# Get the evaluation log
elog <- cv$evaluation_log
# Determine and print how many trees minimize training and test error
elog %>%
summarize(ntrees.train = which.min(train_rmse_mean), # find the index of min(train_rmse_mean)
ntrees.test = which.min(test_rmse_mean)) # find the index of min(test_rmse_mean)
# The number of trees to use, as determined by xgb.cv
ntrees <- 25
# Run xgboost
model_xgb <- xgboost(data = as.matrix(train.treat), # training data as matrix
label = train$y, # column of outcomes
nrounds = ntrees, # number of trees to build
objective = "reg:linear", # objective
eta = 0.001,
depth = 10,
verbose = 0 # silent
)
# Make predictions
test$pred <- predict(model_xgb, as.matrix(test.treat))
# Plot predictions vs actual bike rental count
ggplot(test, aes(x = pred, y = y)) +
geom_point() +
geom_abline()
# Calculate RMSE
test %>%
mutate(residuals = y - pred) %>%
summarize(rmse = sqrt(mean(residuals^2)))
How does this look?
Also, one thing I don't get about xgboost() is why I have to take out the dependent variable from the dataset in the "data" option and then add it back in the "label" option. Why do we do this?
My dataset has 809 observations and 108 independent variables. Here is an arbitrary subset:
structure(list(year = c(2019, 2019, 2019, 2019), ht = c(74, 76,
74, 73), wt = c(223, 234, 215, 215), age = c(36, 29, 32, 24),
gp_l1 = c(16, 16, 11, 14), gp_l2 = c(7, 0, 16, 0), gp_l3 = c(16,
15, 16, 0), gs_l1 = c(16, 16, 11, 13), gs_l2 = c(7, 0, 16,
0), gs_l3 = c(16, 15, 16, 0), cmp_l1 = c(372, 430, 226, 310
), cmp_l2 = c(154, 0, 297, 0), cmp_l3 = c(401, 346, 364,
0), att_l1 = c(597, 639, 365, 486), y = c(8, 71.5, 26, 22
)), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"
))
My RMSE from this xgboost() model is 31.7. Whereas my random forest and glmnet models give RMSEs around 13. The prediction metric I'm comparing to has RMSE of 15.5. I don't get why my xgboost() model does so much worse than my random forest and glmnet models.

Create normally distributed variables with a defined correlation in R

I am trying to create a data frame in R, with a set of variables that are normally distributed. Firstly, we only create the data frame with the following variables:
RootCause <- rnorm(500, 0, 9)
OtherThing <- rnorm(500, 0, 9)
Errors <- rnorm(500, 0, 4)
df <- data.frame(RootCuase, OtherThing, Errors)
In the second part, we're asked to redo the above, but with a defined correlation between RootCause and OtherThing of 0.5. I have tried reading through a couple of pages and articles explaining correlation commands in R, but I am afraid I am struggling with comprehending it.
Easy answer
Draw another random variable OmittedVar and add it to the other variables:
n <- 1000
OmittedVar <- rnorm(n, 0, 9)
RootCause <- rnorm(n, 0, 9) + OmittedVar
OtherThing <- rnorm(n, 0, 9) + OmittedVar
Errors <- rnorm(n, 0, 4)
cor(RootCause, OtherThing)
[1] 0.4942716
Other answer: use multivariate normal function from MASS package:
But you have to define the variance/covariance matrix that gives you the correlation you like (the Sigma argument here):
d <- MASS::mvrnorm(n = n, mu = c(0, 0), Sigma = matrix(c(9, 4.5, 4.5, 9), nrow = 2, ncol = 2), tol = 1e-6, empirical = FALSE, EISPACK = FALSE)
cor(d[,1], d[,2])
[1] 0.5114698
Note:
Getting a correlation other than 0.5 depends on the process; if you want to change it from 0.5, you'll change the details (from adding 1 * OmittedVar in the first strat or changing Sigma in the second strat). But you'll have to look up details on variance rulse of the normal distribution.

XGBoost (R) CV test vs. training error

I'll preface my question by saying that I am, currently, unable to share my data due to extremely strict confidentiality agreements surrounding it. Hopefully I'll be able to get permission to share the blinded data shortly.
I am struggling to get XGBoost trained properly in R. I have been following the guide here and am so far stuck on step 1, tuning the nrounds parameter. The results I'm getting from my cross validation aren't doing what I'd expect them to do leaving me at a loss for where to proceed.
My data contains 105 obervations, a continuous response variable (histogram in the top left pane of the image in the link below) and 16095 predictor variables. All of the predictors are on the same scale and a histogram of them all is in the top right pane of the image in the link below. The predictor variables are quite zero heavy with 62.82% of all values being 0.
As a separate set of test data I have a further 48 observations. Both data sets have a very similar range in their response variables.
So far I've been able to fit a PLS model and a Random Forest (using the R library ranger). Applying these two models to my test data set I've been able to predict and get a RMSE of 19.133 from PLS and 15.312 from ranger. In the case of ranger successive model fits are proving very stable using 2000 trees and 760 variables each split.
Returning to XGBoost, using the code below, I have been fixing all parameters except nrounds and using the xgb.cv function in the R package xgboost to calculate the training and test errors.
data.train<-read.csv("../Data/Data_Train.csv")
data.test<-read.csv("../Data/Data_Test.csv")
dtrain <- xgb.DMatrix(data = as.matrix(data.train[,-c(1)]),
label=data.train[,1])
# dtest <- xgb.DMatrix(data = as.matrix(data.test[,-c(1)]), label=data.test[,1]) # Not used here
## Step 1 - tune number of trees using CV function
eta = 0.1; gamma = 0; max_depth = 15;
min_child_weight = 1; subsample = 0.8; colsample_bytree = 0.8
nround=2000
cv <- xgb.cv(
params = list(
## General Parameters
booster = "gbtree", # Default
silent = 0, # Default
## Tree Booster Parameters
eta = eta,
gamma = gamma,
max_depth = max_depth,
min_child_weight = min_child_weight,
subsample = subsample,
colsample_bytree = colsample_bytree,
num_parallel_tree = 1, # Default
## Linear Booster Parameters
lambda = 1, # Default
lambda_bias = 0, # Default
alpha = 0, # Default
## Task Parameters
objective = "reg:linear", # Default
base_score = 0.5, # Default
# eval_metric = , # Evaluation metric, set based on objective
nthread = 60
),
data = dtrain,
nround = nround,
nfold = 5,
stratified = TRUE,
prediction = TRUE,
showsd = TRUE,
# early_stopping_rounds = 20,
# maximize = FALSE,
verbose = 1
)
library(ggplot)
plot.df<-data.frame(NRound=as.matrix(cv$evaluation_log)[,1], Train=as.matrix(cv$evaluation_log)[,2], Test=as.matrix(cv$evaluation_log)[,4])
library(reshape2)
plot.df<-melt(plot.df, measure.vars=2:3)
ggplot(data=plot.df, aes(x=NRound, y=value, colour=variable)) + geom_line() + ylab("Mean RMSE")
If this function does what I believe it is does I was hoping to see the training error decrease to a plateau and the test error to decrease then begin to increase again as the model overfits. However the output I'm getting looks like the code below (and also the lower figure in the link above).
##### xgb.cv 5-folds
iter train_rmse_mean train_rmse_std test_rmse_mean test_rmse_std
1 94.4494006 1.158343e+00 94.55660 4.811360
2 85.5397674 1.066793e+00 85.87072 4.993996
3 77.6640230 1.123486e+00 78.21395 4.966525
4 70.3846390 1.118935e+00 71.18708 4.759893
5 63.7045868 9.555162e-01 64.75839 4.668103
---
1996 0.0002458 8.158431e-06 18.63128 2.014352
1997 0.0002458 8.158431e-06 18.63128 2.014352
1998 0.0002458 8.158431e-06 18.63128 2.014352
1999 0.0002458 8.158431e-06 18.63128 2.014352
2000 0.0002458 8.158431e-06 18.63128 2.014352
Considering how well ranger works I'm inclined to believe that I'm doing something foolish and causing XGBoost to struggle!
Thanks
To tune your parameters you can use tuneParams. Here is an example
task = makeClassifTask(id = id, data = "your data", target = "the name of the column in your data of the y variable")
# Define the search space
tuning_options <- makeParamSet(
makeNumericParam("eta", lower = 0.1, upper = 0.4),
makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
makeNumericParam("subsample", lower = 0.5, upper = 1),
makeNumericParam("min_child_weight", lower = 3, upper = 10),
makeNumericParam("gamma", lower = 0, upper = 10),
makeNumericParam("lambda", lower = 0, upper = 5),
makeNumericParam("alpha", lower = 0, upper = 5),
makeIntegerParam("max_depth", lower = 1, upper = 10),
makeIntegerParam("nrounds", lower = 50, upper = 300))
ctrl = makeTuneControlRandom(maxit = 50L)
rdesc = makeResampleDesc("CV", iters = 3L)
learner = makeLearner("classif.xgboost", predict.type = "response",par.vals = best_param)
res = tuneParams(learner = learner,task = task, resampling = rdesc,
par.set = tuning_options, control = ctrl,measures = acc)
Of course you can play around with the intervals for your parameters. In the end res will contain the optimal set of parameters for your xgboost and then you can train your xgboost using this parameters. Keep in mind that you can choose other method except apart from cross-validation, try ?makeResampleDesc
I hope it helps

r caret train and predict after tuning

I'm currently working on a research paper.
My dataset consists of daily stock index data with a predictor that says UP or DOWN.
In total I use 20% of my dataset to tune the parameters. Based on the code below I found that I should use sigma = 0.05 and C = 25.
Now, I want to train 50% on my dataset based on these settings and make a prediction on the other 50% of the set.
Could anyone explain how I should do this? I can't find how I should proceed in caret's documentation.
### MODELS
## SetUp Training
trainControl <- trainControl(method="cv", index = fit_on, indexOut = pred_on, savePredictions = TRUE)
metric <- "Accuracy"
## Parameter tuning
# SVM
set.seed(1908)
grid_rf <- expand.grid(C = c(0.1, 0.25, 0.5, 1, 5, 10, 25, 50), sigma = c(0,025, 0.05, 0.1, 0.25, 0.5, 1, 1.5, 2, 2.5, 3, 3.5,4,4.5,5,7,5,10,15))
fit.svm <- train(class~., data=dataCombined, method="svmRadial", metric=metric, preProc=c("range"),trControl=trainControl, tuneGrid = grid_rf)
Many thanks!
Best,
Mathijs

Bayesian error-in-variables (total least squares) model in R using MCMCglmm

I am fitting some Bayesian linear mixed models using the MCMCglmm package in R. My data includes predictors that are measured with error. I'd therefore like to build a model that takes this into account. My understanding is that a basic mixed effects model in MCMCglmm will minimize error only for the response variable (as in ols regression). In other words, vertical errors will be minimized. I'd like to minimize errors orthogonal to the regression line/plane/hyperplane.
Is it possible to fit an error-in-variables (aka total least
squares) model using MCMCglmm or would I have to use JAGS/STAN to
do this?
Is it possible to do this with multiple predictors in the
same model (I have some models with 3 or 4 predictors, each measured
with error)?
If it is possible, how would I specify the model?
I've included a data set below, with a random variable height that is measured with error to illustrate a basic set up with MCMCglmm.
library(nlme)
library(MCMCglmm)
data(Orthodont)
set.seed(1234)
Orthodont$height <- c(rnorm(54, 170, 10), rnorm(54, 150, 10))
prior1 <- list(
B = list(mu = rep(0, 3), V = diag(1e+08, 3)),
G = list(G1 = list(V = 1, nu = 1, alpha.mu = 0, alpha.V = 1000)),
R = list(V = 1, nu = 0.002)
)
model1 <- MCMCglmm(
fixed = distance ~ height + Sex,
random = ~ Subject,
rcov = ~ units,
data = Orthodont,
family = "gaussian",
prior = prior1,
nitt = 1.1e+4,
thin = 10,
burnin = 1e+3,
verbose = FALSE
)
summary(model1)

Resources