Related
I want to create a SHAP plot for feature importance for a GBM model:
# Resampling scheme: 10-fold cross-validation repeated 5 times, keeping class
# probabilities and the hold-out predictions.
ctrlCV <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 5,
  classProbs = TRUE,
  savePredictions = TRUE,
  summaryFunction = twoClassSummary
)

# Fit a GBM on every predictor of training_set, selecting the tuning
# parameters from gbmGRID by the ROC metric.
gbmFit <- train(
  CR ~ .,
  data = training_set,
  method = "gbm",
  metric = "ROC",
  trControl = ctrlCV,
  tuneGrid = gbmGRID,
  verbose = FALSE
)
However, all the examples I found are for xgboost models, and packages like SHAPforxgboost and shapr are not working for me. For example:
# Use the object names defined by the model fit above (gbmFit, training_set).
# NOTE(review): the xgb_model argument suggests shap.values() expects an
# xgboost model, so a caret/gbm fit is presumably still rejected -- confirm.
shap_values <- shap.values(xgb_model = gbmFit, X_train = training_set)
produces an error:
error in `colnames<-`(`*tmp*`, value = c(colnames(x_train), "bias")) : attempt to set 'colnames' on an object with less than two dimensions
I need a plot like this:
How can I do that?
EDIT - my train set using dput():
structure(list(CR = c("nonComplete", "nonComplete", "nonComplete",
"nonComplete", "nonComplete", "nonComplete", "nonComplete", "nonComplete",
"nonComplete", "nonComplete"), gender = c(1, 0, 0, 0, 1, 0, 0,
1, 0, 1), CD4.T.cells = c(-0.0741098696855045, -0.094401270881699,
0.0410284948786532, -0.163302950330185, -0.0942478217207681,
-0.167314411991775, -0.118272811489486, -0.0366277340916379,
-0.0809646843667242, -0.140727850456348), CD8.T.cells = c(-0.178835447722468,
-0.253897294559596, -0.0372301980787381, -0.230579110769457,
-0.224125346052727, -0.196933050675633, -0.344608041139497, -0.0550538743643369,
-0.276178546845023, -0.235047665605314), T.helpers = c(-0.0384421660291032,
-0.0275306107582565, 0.186447606591857, -0.124972070102036, -0.15348122673842,
-0.106812144494277, -0.104757782473888, 0.0686746776877563, -0.0729755869081981,
-0.0783448555726869), NK.cells = c(-0.0924083910597563, -0.172356328661097,
-0.0172673823614314, 0.0280649471541352, -0.128925304635747,
-0.0875076743713435, -0.188649323737844, -0.0518877213975413,
-0.184546079512101, -0.100562282085102), Monocytes = c(-0.0680848706469295,
-0.173427291586957, -0.0106773958944477, -0.0015805672257001,
-0.0751114943036091, -0.0737177243152751, -0.211297995211542,
-0.0674023045286274, -0.149380203815874, -0.0352058106388986),
Neutrophils = c(-0.0391833488213571, -0.0275279418713283,
0.0156454755097513, 0.0285160860867748, -0.0633367938488132,
0.0252778805872529, -0.0827920017974784, 0.0432343965225797,
-0.0693846217599099, -0.0249227307025501), gd.T.Cells = c(-0.162246594987039,
-0.297759223265742, -0.0814825699645205, -0.0688779846190755,
-0.222281334925374, -0.264420103679214, -0.251924422671008,
-0.162709306032616, -0.292342418053931, -0.246818199922858
), Non.plasma.B.cells = c(-0.0384755654971015, -0.114370815587458,
0.161268251261644, -0.0571463865006043, -0.112851511342984,
-0.0822058328898433, -0.118367014322845, 0.114155959200915,
-0.0923514068231641, -0.115614038543851)), row.names = c("Pt1",
"Pt10", "Pt101", "Pt103", "Pt106", "Pt11", "Pt17", "Pt18", "Pt26",
"Pt27"), class = "data.frame")
I've faced this problem before, and for me it only worked for xgboost models. This should work for you, using the shapviz package:
library(shapviz)

# Build the shapviz object from the model, the numeric feature matrix (every
# column except the first) and the original data.
shp <- shapviz(
  model,
  X_pred = data.matrix(data[, -1]),
  X = data
)

# Waterfall plot for the first observation
sv_waterfall(shp, row_id = 1)

# Importance plot drawn as a beeswarm
sv_importance(shp, kind = "beeswarm")
I am calculating the optimum number of clusters. I used the NbClust function to compute it, but it reports that there are too many missing values, and I don't know why: there are no missing values in my data.
It shows this error:
"Error in NbClust(data = df, distance = "euclidean", min.nc = 2, max.nc = 20, :
The TSS matrix is indefinite. There must be too many missing values. The index cannot be calculated."
The data I am using:
dput(read.csv("cluster.csv"))
df = structure(list(St = c("PE", "SU", "PA", "OC", "PE",
"AC", "PP", "RA"), NDDZ91 = c(0.253576604, 0.0551232,
-0.53169303, -0.533246481, -0.533634844, -0.529751216, -0.529751216,
2.349376982), NDDZ92 = c(0.4633855, 0.952926247, -0.905688982,
-0.908031282, 0.815565566, -0.904127448, -0.904127448, 1.390097848
), NDDZ94 = c(0.971257769, 0.602251213, -0.82539626, -0.831562179,
0.018490857, -0.826819164, -0.826819164, 1.718596929), NDDZ95 = c(2.428086592,
-0.050766856, -0.502772844, -0.503557157, -0.289546405, -0.502953839,
-0.502953839, -0.075535652), NDDZ96 = c(0.073650972, 0.482511184,
-0.669130113, -0.675742407, -0.675742407, -0.664721917, -0.09563249,
2.224807178), NDDZ97 = c(2.108725851, 0.193018074, -0.616096838,
-0.618190279, 0.782927149, -0.616096838, -0.616096838, -0.618190279
), NDDZ98 = c(0.422792635, 0.224274925, -0.66324044, -0.674453783,
-0.191577267, -0.670300693, -0.670300693, 2.222805316), NDDZ99 = c(-0.045504148,
0.621635607, -1.030110408, -1.033331082, 0.370677267, 0.370677267,
-1.028730119, 1.774685616), NDDZ103 = c(0.543822029, 1.4294128,
-0.862935822, -0.865183039, 0.206064797, -0.865183039, -0.863310358,
1.277312632), NDDZ105 = c(-0.242116717, -0.327002284, -0.599905416,
-0.602682046, 0.790140631, -0.602682046, -0.598715431, 2.18296331
), NDDZ106 = c(-0.394116657, 1.166937427, -1.070650174, -1.078708713,
0.81841561, -1.078708713, 0.81841561, 0.81841561), NDDZ107 = c(1.493844177,
0.766047601, -1.041282102, -1.04295136, 0.956552995, -0.043914579,
-1.044382153, -0.043914579), NDDZ112 = c(2.137032432, 0.085031825,
-0.601376567, -0.601897927, -0.601897927, -0.601153126, 0.785414418,
-0.601153126), NDDZ113 = c(-0.102481763, -0.288855624, -0.41345193,
-0.41414606, -0.414377436, -0.413220553, 2.45975392, -0.413220553
), NDDZ114 = c(0.100876842, 0.716344963, -0.756031568, -0.758896113,
0.173403417, -0.756850009, -0.756850009, 2.038002477), NDDZ115 = c(-0.058558995,
0.221455542, -0.509307832, -0.505965142, -0.510336352, -0.507765052,
-0.507765052, 2.378242882), NDDZ116 = c(1.377841856, 1.640112838,
-0.676090962, -0.676661736, -0.676947124, -0.67409325, -0.67409325,
0.359931628), NDDZ117 = c(2.177231217, 0.849368214, -0.539426784,
-0.539639833, -0.479549446, -0.53892967, -0.509594639, -0.41945906
), NDDZ119 = c(2.215308855, 0.141088501, -0.679450372, -0.680029439,
-0.106916185, -0.678099214, -0.678099214, 0.466197068), NDDZ122 = c(1.743810041,
0.768581504, -0.772598602, -0.773098804, -0.348192016, -0.772598602,
-0.772598602, 0.926695082), NDDZ123 = c(0.634144889, 1.11554263,
-0.833927192, -0.834643558, -0.021473135, -0.832255672, -0.832255672,
1.60486771)), class = "data.frame", row.names = c(NA, -8L))
The code I have written so far:
# Use the values of the St column as row names, then drop that (first) column
# so that only the numeric NDDZ* columns are passed to NbClust.
# NOTE(review): in the data shown above St contains "PE" twice, and data-frame
# row names must be unique -- verify against the real file.
rownames(df) = c(df$St)
df = df[,-1]
library(NbClust)
# Search for the number of clusters between min.nc and max.nc using Euclidean
# distances and the "ward.D" method.
# NOTE(review): df has only 8 rows, so max.nc = 20 asks for more clusters than
# there are observations.
nbclust_out <- NbClust(
data = df,
distance = "euclidean",
min.nc = 2,
max.nc = 20,
method = "ward.D",
)
But it produces this error: "Error in NbClust(data = df, distance = "euclidean", min.nc = 2, max.nc = 20, :
The TSS matrix is indefinite. There must be too many missing values. The index cannot be calculated."
max.nc is higher than the number of rows in your dataset, which might be causing your issue. Using other packages:
# Packages that provide fviz_nbclust() and fa.parallel()
library(factoextra)
library(psych)

# Remove the non-numeric label column
df$St <- NULL
# Standardise every remaining column
df.scaled <- scale(df)
# Scree plot: within-cluster sum of squares for up to 7 k-means clusters
scree <- fviz_nbclust(
  df.scaled,
  FUNcluster = kmeans,
  method = "wss",
  k.max = 7
)
# Parallel analysis on the principal components
paral <- fa.parallel(df.scaled, fa = "pc")
Based on the plots below, I would suggest 3 clusters. However, the parallel analysis warns that your dataset contains an ultra-Heywood case and that you should examine the results carefully.
This is my first post here, so I am not quite sure how to frame a question, but I will try my best.
I am trying to forecast densities of daily exchange rates, I have chosen EUR/USD as my currency pair that I'd like to forecast. I am using GARCH models to do the forecast. I have done my coding using "rugarch" package. The code looks like this
# DEXUSEU is the daily data of exchange rates.
ex1 <- as.xts(DEXUSEU)
ex2 <- ex1[!is.na(ex1)]

# Log differences in percent (4496 observations according to the author).
# NOTE(review): Nd is not defined in this snippet; presumably the number of
# rows in ex2 -- confirm.
# NOTE(review): if ex2 is an xts object, `/` aligns both operands on their
# time index, so this may not divide each rate by the previous day's rate;
# diff(log(ex2)) is the usual alternative -- verify.
lex2 <- 100 * log((ex2[2:Nd, ]) / (ex2[1:(Nd - 1), ]))

# GARCH(1,1) specification with a constant mean and Student-t innovations.
model1 <- ugarchspec(
  variance.model = list(
    model = "sGARCH",
    garchOrder = c(1, 1),
    submodel = NULL,
    external.regressors = NULL,
    variance.targeting = FALSE
  ),
  mean.model = list(
    armaOrder = c(0, 0),
    include.mean = TRUE,
    archm = FALSE,
    archpow = 1,
    arfima = FALSE,
    external.regressors = NULL,
    archex = FALSE
  ),
  distribution.model = "std"
)

# Fit the model, holding back the last 2000 observations. The spec object is
# model1; the original call passed an undefined `model`.
modelfit1 <- ugarchfit(model1, data = lex2, out.sample = 2000)

# Rolling-window one-step-ahead forecast over the last 2000 observations,
# re-estimating the model every 50 steps.
modelroll1 <- ugarchroll(
  model1,
  data = lex2,
  n.ahead = 1,
  forecast.length = 2000,
  n.start = NULL,
  refit.every = 50,
  refit.window = "rolling",
  window.size = NULL,
  solver = "hybrid",
  fit.control = list(),
  solver.control = list(),
  calculate.VaR = TRUE,
  VaR.alpha = 0.01,
  cluster = NULL,
  keep.coef = TRUE
)
plot(modelroll1, which = 1)
That's how the density forecast looks, and I am quite sure that something is wrong here; it shouldn't look like this:
Can anybody please help and tell me what I did wrong. I can provide additional data/information if needed. I am just not sure what else to provide as this is my first post here. Any kind of help would be very much appreciated.
R version: 3.4.2
I'm using rugarch and rmgarch to specify and fit a DCC model to my data. The model is fitted successfully; however, I'm unable to generate the plots. Here's a snippet of my code:
library(rugarch)
library(rmgarch)
# Read the data; columns 3 and 6 are used as the MSFT and GSPC return series.
da <- read.table("d-msft3dx0113.txt", header = TRUE)
MSFT.ret <- da[, 3]
GSPC.ret <- da[, 6]
MSFT.GSPC.ret <- cbind(MSFT.ret, GSPC.ret)

# Univariate spec shared by both series: ARMA(0,0) mean, sGARCH(1,1)
# variance, normal distribution.
garch11.spec <- ugarchspec(
  mean.model = list(armaOrder = c(0, 0)),
  variance.model = list(garchOrder = c(1, 1), model = "sGARCH"),
  distribution.model = "norm"
)

# DCC(1,1) spec built from two copies of the univariate spec, with a
# multivariate normal distribution.
dcc.garch11.spec <- dccspec(
  uspec = multispec(replicate(2, garch11.spec)),
  dccOrder = c(1, 1),
  distribution = "mvnorm"
)

# Fit the DCC model, forecast 100 steps ahead and plot the forecast.
dcc.fit <- dccfit(dcc.garch11.spec, data = MSFT.GSPC.ret)
dcc.fcst <- dccforecast(dcc.fit, n.ahead = 100)
plot(dcc.fcst)
When I call for plot, I get this error:
plot(dcc.fcst)
Make a plot selection (or 0 to exit):
Conditional Mean Forecast (vs realized returns)
Conditional Sigma Forecast (vs realized |returns|)
Conditional Covariance Forecast
Conditional Correlation Forecast
EW Portfolio Plot with forecast conditional density VaR limits
Selection: 1
Error in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...) :
plot.new has not been called yet
I then give it a new plot area:
# Open an empty plotting frame first, then retry the forecast plot.
plot.new()
plot(dcc.fcst)
Which gives me this unhelpful plot:
Selection1Plot
I have the same problem; I don't know why plot(dcc.fcst) does not work. So I extract the covariance and correlation manually: rcov and rcor are the two functions that extract what we need.
# Conditional covariance between series 1 and 2, taken from the fitted model
plot(
  rcov(dcc.fit)[1, 2, ],
  type = "l",
  col = "blue",
  main = "Conditional Covariance",
  xlab = "Time",
  ylab = "Covariance"
)

# Conditional correlation between series 1 and 2, taken from the fitted model
plot(
  rcor(dcc.fit)[1, 2, ],
  type = "l",
  col = "purple",
  main = "Conditional Correlation",
  xlab = "Time",
  ylab = "Correlation"
)
Code:
library(nnet)
library(caret)
# Resampling scheme used while fitting the model: repeated k-fold
# cross-validation.
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
allowParallel = TRUE) # 10 separate 10-fold cross-validations
# Tuning grid: 4 decay values between 0.0002 and 0.0008, sizes 6, 8 and 10,
# and bag fixed to FALSE.
nnetGrid <- expand.grid(decay = seq(0.0002, .0008, length = 4),
size = seq(6, 10, by = 2),
bag = FALSE)
# Fix the random seed before training.
set.seed(100)
# Train an "avNNet" model for R on all other columns of trainSet, centring and
# scaling the predictors first.
nnetFitcv <- train(R ~ .,
data = trainSet,
method = "avNNet",
tuneGrid = nnetGrid,
trControl = ctrl,
preProc = c("center", "scale"),
linout = TRUE,
## Reduce the amount of printed output
trace = FALSE,
## Expand the number of iterations to find
## parameter estimates..
maxit = 2000,
## and the number of parameters used by the model
## NOTE(review): this limit is written for 5 units, while the grid above uses
## sizes 6 to 10 -- see the answer below about raising MaxNWts.
MaxNWts = 5 * (34 + 1) + 5 + 1)
Error:
Error in train.default(x, y, weights = w, ...) :
final tuning parameters could not be determined
In addition: Warning messages:
1: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
2: In train.default(x, y, weights = w, ...) :
missing values found in aggregated results
data:
dput(head(trainSet))
structure(list(fy = c(317.913756282, 365.006253069, 392.548100067,
305.350697829, 404.999341917, 326.558279739), fu = c(538.962896683,
484.423120589, 607.974981919, 566.461909098, 580.287855801, 454.178316794
), E = c(194617.707566, 181322.455065, 206661.286272, 182492.029532,
189867.929239, 181991.379749), eu = c(0.153782620813, 0.208857408687,
0.29933255604, 0.277013319499, 0.251278125174, 0.20012525805),
imp_local = c(1555.3450957, 1595.41614044, 763.56392418,
1716.78277731, 1045.72429616, 802.742305814), imp_global = c(594.038972858,
1359.48216529, 1018.89209367, 850.887850177, 1381.3557372,
1714.66351462), teta1c = c(0.033375064111, 0.021482368218,
0.020905367537, 0.006956337817, 0.034913536977, 0.03009770223
), k1c = c(4000921.55552, 4499908.41979, 9764999.26902, 9273400.46159,
6163057.88855, 12338543.5703), k2_2L = c(98633499.5682, 53562216.5496,
51597126.6866, 79496746.0098, 54060378.6334, 88854286.5457
), k2_3L = c(53752551.0262, 125020222.794, 124021434.482,
125817803.431, 75021821.6702, 35160224.288), k2_4L = c(56725106.5978,
126865701.893, 145764489.664, 64837586.8755, 49128911.0832,
70088564.0166), bmaxc = c(3481281.32908, 4393584.00639, 2614830.02391,
3128593.72039, 3179348.29527, 4274637.35956), dfactorc = c(2.5474729895,
2.94296926288, 2.79505551368, 2.47882735165, 2.46407943564,
1.41121223341), amaxc = c(73832.9746763, 99150.5068997, 77165.4338508,
128546.996471, 53819.0447533, 54870.9707106), teta1s = c(0.015467320192,
0.013675755546, 0.031668366149, 0.028898297322, 0.019211801086,
0.013349768955), k1s = c(5049506.54552, 11250622.6842, 13852560.5089,
18813117.5726, 18362782.7372, 14720875.0829), k2_ab1s = c(276542468.441,
275768806.723, 211613299.608, 264475187.749, 162043062.526,
252936228.465), k2_ab2s = c(108971516.033, 114017918.32,
248886114.151, 213529935.615, 236891513.077, 142986118.909
), k2_ab3s = c(33306211.9166, 28220338.4744, 40462423.2281,
23450400.4429, 46044346.1128, 23695405.2598), bmaxab1 = c(4763935.86742,
4297372.01966, 3752983.00638, 4861240.46459, 4269771.8481,
4162098.23435), bmaxab2 = c(1864128.647, 1789714.6047, 2838412.50704,
2122535.96812, 2512362.60884, 1176995.61871), ab1 = c(66.4926766666,
42.7771212442, 45.4212664748, 50.3764074404, 35.4792060556,
34.1116517971), ab2 = c(21.0285105309, 23.5869838719, 18.8524808986,
10.1121885612, 10.9695055644, 12.1154127169), dfactors = c(2.47803921947,
0.874644748155, 0.749837099991, 1.96711589185, 2.5407774352,
1.28554379333), teta1f = c(0.037308451805, 0.035718600749,
0.012495093438, 0.000815957999, 0.002155991091, 0.02579104469
), k1f = c(14790480.9871, 17223538.1853, 19930679.8931, 3524230.46974,
15721827.0137, 13599317.0371), k2f = c(55614283.976, 54695745.7762,
86690362.7036, 99857853.7312, 63119072.711, 37510791.5472
), bmaxf = c(2094770.19484, 3633133.51482, 1361188.05421,
2001027.51219, 2534273.6726, 3765850.14143), dfactorf = c(0.745459795314,
2.04869176933, 0.853221909609, 1.76652410119, 0.523675021418,
1.0808768613), k2b = c(1956.92858062, 1400.78738327, 1771.23607857,
1104.05501369, 1756.6767193, 1509.9294956), amaxb = c(38588.0915097,
35158.1672213, 25711.062782, 21103.1603387, 27230.6973685,
43720.3558889999), dfactorb = c(0.822346959126, 2.34421354848,
0.79990635332, 2.99070447299, 1.76373031599, 1.38640223249
), roti = c(16.1560390049, 12.7223971386, 6.43238062144,
15.882552267, 16.0836252663, 18.2734832893), rotmaxbp = c(0.235615453341,
0.343204895932, 0.370304533553, 0.488746319999, 0.176135112774,
0.46921999001), R = c(0.022186087, 0.023768855, 0.023911029,
0.023935705, 0.023655335, 0.022402726)), .Names = c("fy",
"fu", "E", "eu", "imp_local", "imp_global", "teta1c", "k1c",
"k2_2L", "k2_3L", "k2_4L", "bmaxc", "dfactorc", "amaxc", "teta1s",
"k1s", "k2_ab1s", "k2_ab2s", "k2_ab3s", "bmaxab1", "bmaxab2",
"ab1", "ab2", "dfactors", "teta1f", "k1f", "k2f", "bmaxf", "dfactorf",
"k2b", "amaxb", "dfactorb", "roti", "rotmaxbp", "R"), row.names = c(7L,
8L, 20L, 23L, 28L, 29L), class = "data.frame")
The data has no identical rows, zero values, or NaNs. Any help is appreciated.
I guess the problem is caused by MaxNWts, which is the maximum allowable number of weights. The value you gave is smaller than the number of weights needed for networks with more than 5 hidden units. It should be at least:
# Minimum weight limit for the largest network in the tuning grid:
# size * (predictors + bias) + size + output.
# The `+` is kept at the end of the line: R treats a line that starts with `+`
# as a separate statement, which would silently drop the second term.
output_neuron <- 1                  # single output unit
n_predictors <- ncol(trainSet) - 1  # every column except the response R
MaxNWts <- max(nnetGrid$size) * (n_predictors + output_neuron) +
  max(nnetGrid$size) + output_neuron
So, in your case, it should be at least MaxNWts = 10 * (34 + 1) + 10 + 1