How do I find best starting values for nlimb optimization? - r

I am trying to run the following code with the attached dataset. How do I solve the error of hessian matrix inversion?
library(stats4)
library(bbmle)
library(stats)
library(numDeriv)
library('bbmle')
x<- c(1.1, 1.4, 1.3, 1.7,1.9, 1.8, 1.6, 2.2, 1.7, 2.7, 4.1, 1.8, 1.5, 1.2, 1.4, 3, 1.7, 2.3, 1.6, 2.0)
hist(x)
fEHLKUMW<-function(a,b,alpha,vartheta)
{
-sum(log( (2*a*b*alpha*vartheta*(x^(vartheta-1))*(exp(-x^(vartheta)))*((1- (exp(-x^(vartheta))))^(a-1))*((1-((1-((1-(exp(-x^(vartheta))))^a))^b))^(alpha-1)))/((1-((1-(exp(-x^(vartheta))))^a))^(b*(alpha+1)))
))
}
EHLKUMW.result<-mle2(fEHLKUMW,hessian = NULL,start=list(a=0.01,b=0.01,alpha=.3,vartheta=0.01),optimizer="nlminb",lower=0)
summary(EHLKUMW.result)
I get the error as;
**
Warning messages:
1: In nlminb(start = start, objective = objectivefunction, hessian = NULL, :
NA/NaN function evaluation
2: In mle2(fEHLKUML, hessian = NULL, start = list(a = 1, b = 0.4, c = 0.5, :
couldn't invert Hessian
**

This is a very open question I think, so I'll present some tools and an approach to this, and maybe others can comment, etc.
First, the main part:
library(bbmle)
library(stats)
library(numDeriv)
library(bbmle)
x<- c(1.1, 1.4, 1.3, 1.7,1.9, 1.8, 1.6, 2.2, 1.7, 2.7, 4.1, 1.8, 1.5, 1.2, 1.4, 3, 1.7, 2.3, 1.6, 2.0)
hist(x)
fEHLKUMW <- function(a,b,alpha,vartheta) {
-sum(log( (2*a*b*alpha*vartheta*(x^(vartheta-1))*(exp(-x^(vartheta)))*((1- (exp(-x^(vartheta))))^(a-1))*((1-((1-((1-(exp(-x^(vartheta))))^a))^b))^(alpha-1)))/((1-((1-(exp(-x^(vartheta))))^a))^(b*(alpha+1)))
))
}
Now, we can of course run it like you've done:
EHLKUMW.result <- mle2(
fEHLKUMW,
hessian = NULL,
start = list(
a = 0.01,
b = 0.01,
alpha = .3,
vartheta = 0.01
),
optimizer = "nlminb",
lower = 0
)
But we can also run it with a distribution on each of these parameters, to get a new input all the time:
EHLKUMW.result <- mle2(
fEHLKUMW,
hessian = NULL,
start =
list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
),
optimizer = "nlminb",
lower = 0
)
I've chosen to use trundist to get a t-distribution, centralised around
the values you provided, and lower is 0 through the a = -argument.
If you know what the upper limit of these parameters, this can be done with b = -argument.
The output that I think is most relevant are the attained logLik and the coef.
library(tidyverse)
exec(fEHLKUMW, !!!list(
a = 0.01,
b = 0.01,
alpha = .3,
vartheta = 0.01
))
replicate(
250,
exec(fEHLKUMW, !!!list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
)))
I used this to fine-tune the distributions the parameters
The following is a multiple runs and their output compared to
logLik.
tibble(n = seq_len(100),
output = map(n, ~mle2(
fEHLKUMW,
hessian = NULL,
start =
list(
# a = 0.01,
# a = rt(1, 10, ncp = 0.01),
a = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# b = 0.01,
b = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10),
# alpha = .3,
alpha = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.3, df = 10),
# vartheta = 0.01
vartheta = truncdist::rtrunc(1, spec = "t", a = 0, ncp = 0.01, df = 10)
),
optimizer = "nlminb",
lower = 0
))) ->
outputs_df
This code gives a nice print
outputs_df %>%
mutate(coef = output %>% map(coef),
logLik = output %>% map_dbl(logLik)) %>%
unnest_wider(coef) %>%
arrange(logLik) %>%
print(n=Inf)

Related

Error in sampleN2diffmeans(delta = 4, sigma = 2.5, alpha = 0.05, power = 0.9, : could not find function "sampleN2diffmeans"

Error in sampleN2diffmeans(delta = 4, sigma = 2.5, alpha = 0.05, power = 0.9, :
could not find function "sampleN2diffmeans"
Uploaded packages are pwr and rpact.
Are there any alternative for this?

Iterate over character vector in dplyR summarise and use it to assign new column names

I have a character vector with the name of my variables:
variables -> c("w", "x", "y", "z")
I need to create a function that calculates the mean of every variable for a specified parameter (as below for alpha). However, it doesn't rename the columns with the iterating variable names and does not reduce the alpha columns down to one on the left.
calc <- function(df,
parameter,
iteration,
variables){
variable <- sym(variables[iteration])
mean <- df %>% group_by(.dots = parameter) %>%
summarise(variable = mean(!!variable),sd_variable = sd(!!variable))
return(mean)
}
means <- map_dfc(1:length(variables), ~calc(df = input,
parameter = "alpha",
iteration = .,
variables = variables))
Ideally the output df (means) would look like this:
alpha | w | sd_w | x | sd_x | y | sd_y | z | sd_z |
Here is what the input df looks like:
structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 999.5, 999.6,
999.7, 999.8, 999.9, 1000), w = c(10, 10.0057192322758, 10.0198266325956,
10.040096099625, 10.0637654242843, 10.087779652849, 0.889708853982268,
0.890916575744663, 0.892121389863897, 0.89332329218135, 0.894522278550115,
0.895718344834999), x = c(10, 11.0467963604334, 12.1782709261765,
13.3728962503142, 14.6035317074526, 15.8398164069251, 62.6631746231113,
62.6583134156356, 62.6534565303638, 62.648604016965, 62.6437559251575,
62.6389123047088), y = c(10, 9.89605687874935, 9.59253574727296,
9.11222320249057, 8.48917353431654, 7.76447036695841, 0.00833796964522317,
0.00835876233547079, 0.00837957883570158, 0.00840041916631544,
0.00842128334742553, 0.00844217139885453), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
-7.50539460838544, -7.48234149534558, -7.45927733670319, -7.43620225192078,
-7.41311636057114, -7.39001978233681), alpha = c(0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1), lambda = c(0.95,
0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95
), phi = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5), kappa = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "3.9995", "3.9996", "3.9997",
"3.9998", "3.9999", "3.10000"), class = "data.frame")
Ideally the function would use dplyr and/or baseR.
If I understand you correctly, there's no need to iterate over columns. It can all be done directly in dplyr...
library(tidyverse)
df %>%
group_by(alpha) %>%
summarise(
across(
c(w, x, y, z),
list(mean=mean, sd=sd)
),
.groups="drop"
) %>%
rename_with(function(x) str_sub(x,1,1), ends_with("mean"))
# A tibble: 2 x 9
alpha w w_sd x x_sd y y_sd z z_sd
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.1 10.0 0.0345 12.8 2.20 9.14 0.875 7.99 1.38
2 0.5 0.893 0.00225 62.7 0.00908 0.00839 0.0000390 -7.45 0.0432

plotly R reordering factor based on numeric variable

I need to reorder the x-axis categorical variable with a numeric variable so the plot is simpler to follow with plotly, in R.
Here is some dummy data:
agg = structure(list(catvar = c("a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o"), v1 = c(1.1, 1.3, 1,
0.8, 1.2, 1.4, 0.7, 2.5, 0.9, 2.5, 1.7, 0.9, 1.7, 1.1, 0.9),
v2 = c(0.1, 0.1, 0.1, 0, 0.1, 0.1, 0, 0.2, 0, 0.2, 0.1, 0.1,
0.1, 0.1, 0.1), v3 = c(7.3, 8.2, 6.4, 6, 7.5, 8.4, 5.8, 12.4,
6.4, 13.1, 9.3, 6.2, 9.4, 7.1, 6.3)), .Names = c("catvar",
"v1", "v2", "v3"), class = "data.frame", row.names = c(NA, -15L
))
Here is what I've been trying out. It seems recently a new feature was added to handle this but it's not working.
# the reordering is handled here:
ax <- list(
type = "category",
categoryorder = "array",
categoryarray = agg$catvar[order(agg[['v1']])],
showgrid = TRUE,
showticklabels = TRUE
)
p <- plot_ly(agg, x =~catvar,
y =~v2, name = 'v2',
type = 'scatter', mode = 'lines') %>%
add_trace(y =~v1, name = 'v1') %>%
add_trace(y =~v3, name = 'v3') %>%
layout(
xaxis = ax,
hovermode = 'x+y')
p
All I need is to re-order the x-axis (catvar) based on any of the numeric varaibles in the data.frame, these are v1, v2, v3.
You need to add to your code a command for reordering your dataset according to v1.
agg2 <- agg[order(agg[['v1']]),]
p <- plot_ly(agg2, x =~catvar,
y =~v2, name = 'v2',
type = 'scatter', mode = 'points') %>%
add_trace(y =~v1, name = 'v1') %>%
add_trace(y =~v3, name = 'v3') %>%
layout(
xaxis = ax,
hovermode = 'x+y')
p

Error with using unlist, lapply and grepl in data.tables R

This question is an extension of this particular question. I have this particular data.table. I'm using data.table, mc2d, and e1071 libraries
library("data.table")
library("mc2d")
library("e1071")
col <- c("COST","TIME")
dt <- structure(
list(
ID = c("a", "b", "c", "d", "e", "f", "g", "h", "i", "j"),COST_PR_L = c(NA, 0.4, 0.31, 0.4, 0.5, 0.17, 1, 0.5, 0.5, 0.5),COST_PR_U = c(7.5, 2, 2.67, 1.67, 2.4,2, 1.5, 2, 2, 1.67),COST_PO_L = c(NA, 0.33, 0.25, 0.44,0.5, 0.25, 1, 0.5, 0.5, 0.5),COST_PO_U = c(3, 1.43, 3.33,1.8, 2.4, 3.6, 1.45, 2, 1.5, 1.67), TIME_PR_L = c(NA, 0.5,0.4, 0.5, 0.5, NA, 0.67, 0.5, 0.5, 0.5), TIME_PR_U = c(2,2.5, 3, 1.5, 2, NA, 1.5, 2, 1.67, 2), TIME_PO_L = c(NA,0.4, 0.25, 0.56, 0.5, NA, 0.6, 0.5, 0.5, 0.5), TIME_PO_U = c(2,2, 5, 1.67, 2.5, NA, 1.5, 2, 1.67, 2)
),.Names = c("ID","COST_PR_L", "COST_PR_U","COST_PO_L","COST_PO_U","TIME_PR_L","TIME_PR_U","TIME_PO_L","TIME_PO_U"),class = c("data.table","data.frame"),row.names = c(NA,-10L))
When I run this particular operation on it,
dt[, unlist(lapply(col, function(xx) {
y = colnames(dt)[grepl(pattern = xx, x = colnames(dt))]
vars1 = y[grepl(pattern = "PR", x = y)]
vars2 = y[grepl(pattern = "PO", x = y)]
mn = get(vars1[1])
mx = get(vars1[2])
sk1 = ifelse(mn !=0 && mx !=0,skewness(rpert(1000, min = mn , mode = 1, max= mx )),-1)
mn = get(vars2[1])
mx = get(vars2[2])
sk2 = ifelse(mn !=0 && mx !=0,skewness(rpert(1000, min = mn , mode = 1, max= mx )),-1)
return(list(sk1, sk2))
}), recursive = FALSE)
, by = "ID"]
I get the following error
Error in [.data.table(dt, , unlist(lapply(col, function(xx) { :
Column 1 of result for group 2 is type 'double' but expecting type
'logical'. Column types must be consistent for each group.
However, If I remove the unlist in the code, It seems to calculate the answer. What is unlist doing that is messing it up?

SVM from e1071 R package replaces labels if there is a feature with only 1 unique value

Why SVM from e1071 package replaces original labels by "1" and "2", if there is at least one such column having only one unique value?
For example, the code below works correctly:
trainData <- data.frame("cA" = c(1, 1, 1, 0.99),
"cB" = c(0.5, 0.6, 0.5, 0.3),
"is_match" = factor(c("N", "N", "P", "P")))
testData <- data.frame("cA" = c(1, 1, 0, 0),
"cB" = c(0.2, 0.3, 0.2, 0.1))
model <- svm(is_match ~ ., data = trainData, type = "C-classification")
pred <- predict(model, testData, type = "class")
print(pred)
it returns
1 2 3 4
P P P P
However, if I change 0.99 to 1 in the first column - so that all values become the same - svm changes labels "N" and "P" to "1" and "2":
trainData <- data.frame("cA" = c(1, 1, 1, 1),
"cB" = c(0.5, 0.6, 0.5, 0.3),
"is_match" = factor(c("N", "N", "P", "P")))
testData <- data.frame("cA" = c(1, 1, 0, 0),
"cB" = c(0.2, 0.3, 0.2, 0.1))
model <- svm(is_match ~ ., data = trainData, type = "C-classification")
pred <- predict(model, testData, type = "class")
print(pred)
Such code returns:
1 2 3 4
2 2 2 2
Additional notes:
It happens with all possible values in column (zeros, NAs) as long as they are all the same for each instance
if labels are digits, svm doesn't replace them
other ML methods like rpart or ada works correctly

Resources