NbClust with one cluster - r

I have a dataset that I split by specific parameters, and I run the NbClust function on each subset to calculate the optimal number of clusters. Every once in a while a subset effectively contains only one cluster, and NbClust breaks with
Error in sample.int(m, k) : cannot take a sample larger than the population when 'replace = FALSE'
Is there a generic workaround? Thanks!!
Data are attached:
df1 = structure(c(-0.01400863, -0.01400863, 0.00712136, 0.01377456,
                  0.00712136, 0, 0, -0.00636396, 0, 0.00636396),
                .Dim = c(5L, 2L))
library(NbClust)
nb = NbClust(data = df1, diss = NULL, distance = "euclidean",
             min.nc = 2, max.nc = 10, method = "kmeans", alphaBeale = 0)
EDIT (8/15/2018): I have found a solution to the problem. It simply bypasses NbClust when nrow(unique(df1)) < 5, since max.nc must be at least min.nc + 2.
n_clusters = nrow(unique(df1))
if (nrow(unique(df1)) > 4) {
  nb = NbClust(data = df1, diss = NULL, distance = "euclidean",
               min.nc = 2, max.nc = min(nrow(unique(df1)), 10),
               method = "kmeans", alphaBeale = 0)
  n_clusters = max(unlist(nb[4]))
  print(n_clusters)
}
clusters = kmeans(df1,n_clusters)
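A more generic alternative (my own sketch, not part of the original fix) is to wrap the NbClust call in tryCatch() and fall back to a single cluster whenever NbClust cannot run; as in the code above, it assumes the fourth element of the NbClust result holds the best partition:
n_clusters = tryCatch({
  nb = NbClust(data = df1, diss = NULL, distance = "euclidean",
               min.nc = 2, max.nc = min(nrow(unique(df1)), 10),
               method = "kmeans", alphaBeale = 0)
  max(unlist(nb[4]))        # number of clusters in the best partition
}, error = function(e) 1)   # fall back to one cluster if NbClust errors out
clusters = kmeans(df1, n_clusters)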

Related

R noob: running a simple 7-variable CSV through a bvarsv model

My simple code is yielding:
Error in `dimnames<-.data.frame`(`*tmp*`, value = list(n)) :
  invalid 'dimnames' given for data frame
Any help appreciated.
library(bvarsv)
library(tidyverse)
library(janitor)
library(readxl)
set.seed(1)
test = read_excel("Desktop/test.csv")
bvar.sv.tvp(test, p = 2, tau = 40, nf = 10, pdrift = TRUE, nrep = 50000,
nburn = 5000, thinfac = 10, itprint = 10000, save.parameters = TRUE,
k_B = 4, k_A = 4, k_sig = 1, k_Q = 0.01, k_S = 0.1, k_W = 0.01,
pQ = NULL, pW = NULL, pS = NULL)
Edit:
The documentation specifies:
Y - Matrix of data, where rows represent time and columns are different variables. Y must have at least two columns.
So when you read in your dataset, time will be a column at first, meaning you have to transform the data frame so that the time column becomes your row names. (You may also want to use the lubridate package to parse your time column first; see the sketch after the code below.)
tst <- read.csv("Desktop/tst.csv", header = TRUE, sep = ",")
# read.csv already returns a data frame, so no extra conversion is needed
rownames(tst) <- tst[, 1]   # move the time column into the row names
tst[, 1] <- NULL            # drop the original time column
bvar.sv.tvp(tst, ...)
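If the time column is stored as text, the lubridate parsing mentioned above would come before moving it into the row names. A minimal sketch, where the column name date and the ymd() format are assumptions about your file:
library(lubridate)
tst <- read.csv("Desktop/tst.csv", header = TRUE, sep = ",")
tst$date <- ymd(tst$date)                 # parse the (hypothetical) date column
rownames(tst) <- as.character(tst$date)   # use the parsed dates as row names
tst$date <- NULL                          # drop the original time column
bvar.sv.tvp(tst, ...)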
You can also use the usmacro dataset as an example to see what the input data of bvar.sv.tvp() should look like.
data(usmacro)
print(usmacro)
Original Post:
I don't know what your csv looks like, so it is hard to tell what the actual issue is.
But you can try wrapping your data in as.data.frame() like this:
bvar.sv.tvp(as.data.frame(test), p = 2, tau = 40, nf = 10, pdrift = TRUE, nrep = 50000,
nburn = 5000, thinfac = 10, itprint = 10000, save.parameters = TRUE,
k_B = 4, k_A = 4, k_sig = 1, k_Q = 0.01, k_S = 0.1, k_W = 0.01,
pQ = NULL, pW = NULL, pS = NULL)

Prp plot - Coloring positive and negative values differently

I am fitting regression trees via the function rpart(). Given my data, I am going to have both positive and negative estimates in nodes. Is there a way to color them differently?
In particular, what I would like to have is a tree whose nodes are shaded in blue for negative values and in red for positive values, where darker colors signal stronger absolute values.
I attach a minimal reproducible example.
library(rpart)
library(rpart.plot)
# Simulating data.
set.seed(1986)
X = matrix(rnorm(2000, 0, 1), nrow = 1000, ncol = 2)
epsilon = matrix(rnorm(1000, 0, 0.01), nrow = 1000)
y = X[, 1] + X[, 2] + epsilon
dta = data.frame(X, y)
# Fitting regression tree.
my.tree = rpart(y ~ X1 + X2, data = dta, method = "anova", maxdepth = 3)
# Plotting.
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3)
# Repeating, with median(y) != 0.
X = matrix(rnorm(2000, 5, 1), nrow = 1000, ncol = 2)
epsilon = matrix(rnorm(1000, 0, 0.01), nrow = 1000)
y = X[, 1] + X[, 2] + epsilon
dta = data.frame(X, y)
my.tree = rpart(y ~ X1 + X2, data = dta, method = "anova", maxdepth = 3)
# HERE I NEED HELP!
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3)
As far as I understand, the box.palette option gave me the result I need in the first setting because median(y) is close to zero.
In the second setting, however, I am unhappy: I get blue shades for values below median(y) and red shades for values above it. How can I impose zero as the threshold between the two colors?
To be more specific, I would like a command that automatically enforces this two-color scheme in any tree.
OK, I answered my own question. The solution is actually quite simple: if the box.palette option is a two-color diverging palette (as in my example), we can use pal.thresh to set the threshold we want. In my case:
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3,
pal.thresh = 0) # HERE THE SOLUTION!
Even though this is probably bad for me, I will leave the answer here for future users and close the question rather than deleting it.

SVAR estimation using vars package: scoring algorithm error

I am trying to estimate an A/B SVAR model using the scoring algorithm from the vars package, but I get the following error:
mA = matrix(data = c(1, 0, 0, 0, 1, -avg_elas, -IVoutput[2, 1], -IVoutput[3, 1], 1),
            nrow = 3, ncol = 3, byrow = TRUE)
mB = matrix(data = c(NA, NA, 0, 0, NA, 0, 0, 0, NA), nrow = 3, ncol = 3, byrow = TRUE)
svar = SVAR(BPvar, estmethod = "scoring", Amat = mA, Bmat = mB, max.iter = 1000)
Error in `[<-`(`*tmp*`, pos[i], i, value = 1) : subscript out of bounds
In addition: Warning message:
In grid.Call(C_getGPar) : reached elapsed time limit
It seems like a pretty simple error, but I can't figure out what it is related to. When I instead use estmethod = 'direct' I don't get an error, but I would prefer to use the scoring method because I am trying to replicate a study.
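For reference, here is a minimal sketch of an A-model specification in the style of the worked example in the vars documentation (using the Canada dataset shipped with the package). This is not a diagnosis of the error above, only a reference point; note that coefficients to be estimated are marked with NA rather than given numeric values:
library(vars)
data(Canada)
var.2c <- VAR(Canada, p = 2, type = "const")
amat <- diag(4)
diag(amat) <- NA          # coefficients to be estimated are set to NA
amat[2, 1] <- NA
amat[4, 1] <- NA
SVAR(var.2c, estmethod = "scoring", Amat = amat, Bmat = NULL,
     max.iter = 100, maxls = 1000, conv.crit = 1.0e-8)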

Unused argument in K Means

I would like to do a cluster analysis with Kmeans and use the Euclidean distance.
This is part of my code:
library(factoextra)  # provides fviz_nbclust()
WKA_ohneJB <- read.csv("WKA_ohneJB.csv", header = TRUE, sep = ";", stringsAsFactors = FALSE)
WKA_ohneJB_scaled <- scale(WKA_ohneJB)
set.seed (123)
WKA_ohneJB_sample <- sample(1:500, 300)
WKA_ohneJB_scaled <- WKA_ohneJB_scaled[WKA_ohneJB_sample,]
kmeans(WKA_ohneJB_scaled, 8, iter.max = 10, nstart = 1, method = "euclidean")
fviz_nbclust(WKA_ohneJB_scaled, kmeans, method = "wss")+ geom_vline(xintercept = 8, linetype = 2)
Error in kmeans(WKA_ohneJB_scaled, 8, iter.max = 10, nstart = 1, method = "euclidean") :
  unused argument (method = "euclidean")
You have to install the amap package to be able to use its Kmeans function (with a capital K), which is different from the base kmeans function.
Here's the link to the documentation: https://www.rdocumentation.org/packages/amap/versions/0.8-18/topics/Kmeans
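A minimal sketch of both options, assuming the amap package is installed (its Kmeans() defaults to the Euclidean metric, while base kmeans() always uses Euclidean distance, so there the argument can simply be dropped):
library(amap)
fit <- Kmeans(WKA_ohneJB_scaled, centers = 8, iter.max = 10, nstart = 1,
              method = "euclidean")
# Alternatively, with base R (Euclidean by definition):
fit <- kmeans(WKA_ohneJB_scaled, centers = 8, iter.max = 10, nstart = 1)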

Retrain mxnet model in R

I have created a neural network with mxnet. Now I want to train this model iteratively on new data points: after simulating a new data point, I want to make another gradient-descent update to the model. I do not want to save the model to an external file and load it again.
I have written the following code, but the weights do not change after the new training step. I also get NaN as the training error.
library(mxnet)
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 2, no.bias = TRUE)
lro <- mx.symbol.LinearRegressionOutput(fc1)
# first data observation
train.x = matrix(0, ncol = 3)
train.y = matrix(0, nrow = 2)
# first training step
model = mx.model.FeedForward.create(lro,
X = train.x, y = train.y, initializer = mx.init.uniform(0.001),
num.round = 1, array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
# second data observation
train.x = matrix(0, ncol = 3)
train.x[1] = 1
train.y = matrix(0, nrow = 2)
train.y[1] = -33
# retrain model on new data
# pass on params of old model
model = mx.model.FeedForward.create(symbol = model$symbol,
arg.params = model$arg.params, aux.params = model$aux.params,
X = train.x, y = train.y, num.round = 1,
array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
# weights do not change
print(model$arg.params)
I found a solution. begin.round in the second training step must be greater than num.round in the first training step, so that the model continues to train.
library(mxnet)
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 2, no.bias = TRUE)
lro <- mx.symbol.LinearRegressionOutput(fc1)
# first data observation
train.x = matrix(0, ncol = 3)
train.y = matrix(0, nrow = 2)
# first training step
model = mx.model.FeedForward.create(lro,
X = train.x, y = train.y, initializer = mx.init.uniform(0.001),
num.round = 1, array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
# second data observation
train.x = matrix(0, ncol = 3)
train.x[1] = 1
train.y = matrix(0, nrow = 2)
train.y[1] = -33
# retrain model on new data
# pass on params of old model
model = mx.model.FeedForward.create(symbol = model$symbol,
arg.params = model$arg.params, aux.params = model$aux.params,
X = train.x, y = train.y, begin.round = 2, num.round = 3,
array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
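Building on that answer, here is a minimal sketch (my own, not from the original answer) of how the begin.round/num.round bookkeeping could be carried through a loop of single-observation updates, reusing only the calls shown above; the simulated observations are placeholders:
round_counter <- 2
for (i in 1:5) {
  # simulate a new observation (placeholder data)
  train.x <- matrix(rnorm(3), ncol = 3)
  train.y <- matrix(rnorm(2), nrow = 2)
  # continue training from the previous parameters
  model <- mx.model.FeedForward.create(symbol = model$symbol,
      arg.params = model$arg.params, aux.params = model$aux.params,
      X = train.x, y = train.y,
      begin.round = round_counter, num.round = round_counter + 1,
      array.batch.size = 1, array.layout = "rowmajor",
      learning.rate = 0.1, eval.metric = mx.metric.mae)
  round_counter <- round_counter + 2   # keep begin.round ahead of the last num.round
}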
Did you try to call mx.model.FeedForward.create only once and then use the fit function for incremental training?
