R: Error with Nelder-Mead optimization to select starting values

The data can be found here
library(nlme)
library(dfoptim)
dat0 <- read.table("aids.dat2", header = TRUE)
dat1 <- dat0[dat0$day <= 90, ]              # use only first 90-day data
dat2 <- dat1[!apply(is.na(dat1), 1, any), ] # remove missing data
aids.dat <- groupedData(lgcopy ~ day | patid, data = dat2)
aids.dat$log10copy <- log10(aids.dat$lgcopy)
myfun2 <- function(arg){
  s.p1 <- arg[1]
  s.b1 <- arg[2]
  s.p2 <- arg[3]
  s.b2 <- arg[4]
  model <- nlme(log10copy ~ exp(p1 - b1*day) + exp(p2 - b2*day),
                fixed = list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1),
                random = list(patid = pdDiag(list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1))),
                start = list(fixed = c(p1 = s.p1, b1 = s.b1, p2 = s.p2, b2 = s.b2)),
                data = aids.dat)
  return(model$logLik)
}
nmkb(fn = myfun2, par = c(10,0.5,6,0.005), lower = c(5, 0.1, 5, 0.001), upper = c(15, 1, 10, 0.1))
Running the above code, I run into the following error and warning:
Error in nlme.formula(log10copy ~ exp(p1 - b1 * day) + exp(p2 - b2 * day), :
step halving factor reduced below minimum in PNLS step
In addition: Warning message:
In nlme.formula(log10copy ~ exp(p1 - b1 * day) + exp(p2 - b2 * day), :
Singular precision matrix in level -1, block 1
The model fits fine with the starting values par = c(10, 0.5, 6, 0.005). However, I think that as the optimization algorithm tries other starting values between lower = c(5, 0.1, 5, 0.001) and upper = c(15, 1, 10, 0.1), the nlme call runs into the above problems because it is so sensitive to starting values. As a result, the nmkb call never gets anywhere.
Is there a way to circumvent this?

The model log-likelihood needs to be maximized, but many optimization procedures in R perform minimization, so the function to be optimized has to be the negative log-likelihood. It should look like this:
myfun2 <- function(arg){
  s.p1 <- arg[1]
  s.b1 <- arg[2]
  s.p2 <- arg[3]
  s.b2 <- arg[4]
  model <- nlme(log10copy ~ exp(p1 - b1*day) + exp(p2 - b2*day),
                fixed = list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1),
                random = list(patid = pdDiag(list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1))),
                start = list(fixed = c(p1 = s.p1, b1 = s.b1, p2 = s.p2, b2 = s.b2)),
                data = aids.dat)
  return(-model$logLik)
}
Although there are still many warnings, there is no longer an error on my machine, and the algorithm converges successfully:
$par
[1] 13.460199068 0.848526199 7.764024099 0.001513636
$value
[1] -358.6631
$feval
[1] 197
$restarts
[1] 0
$convergence
[1] 0
$message
[1] "Successful convergence"
Warning messages:
1: In nlme.formula(log10copy ~ exp(p1 - b1 * day) + exp(p2 - b2 * day), :
Singular precision matrix in level -1, block 1
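If nlme still fails outright for some of the candidate starting values that nmkb tries, one further option is to trap the failure inside the objective function and return a large penalty, so the optimizer simply moves away from that region instead of aborting. This is only a sketch; the name myfun3 and the penalty value 1e10 are arbitrary choices:
myfun3 <- function(arg){
  fit <- try(nlme(log10copy ~ exp(p1 - b1*day) + exp(p2 - b2*day),
                  fixed = list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1),
                  random = list(patid = pdDiag(list(p1 ~ 1, b1 ~ 1, p2 ~ 1, b2 ~ 1))),
                  start = list(fixed = c(p1 = arg[1], b1 = arg[2], p2 = arg[3], b2 = arg[4])),
                  data = aids.dat),
             silent = TRUE)
  if (inherits(fit, "try-error")) return(1e10)  # failed fit: report a large penalty
  -fit$logLik                                   # otherwise the negative log-likelihood
}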


How can I find the maximum output of a function

If I have a GLM, is there any way I can efficiently find the maximum predicted output by varying one covariate while holding the others constant?
Using my simulated data:
# FUNCTIONS ====================================================================
library(dplyr)    # for %>%, rowwise(), mutate()
library(ggplot2)  # for plot_beta_duration()

logit <- function(p){
  log(p / (1 - p))
}
sigmoid <- function(x){
  1 / (1 + exp(-x))
}
beta_duration <- function(D, select){
  logit(
    switch(select,
           0.05 + 0.9 / (1 + exp(-2*D + 25)),
           0.9 * exp(-exp(-0.5 * (D - 11))),
           0.9 * exp(-exp(-(D - 11))),
           0.9 * exp(-2 * exp(-(D - 9))),
           sigmoid(0.847 + 0.210 * (D - 10)),
           0.7 + 0.0015 * (D - 10) ^ 2,
           0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
    )
  )
}
beta_sex <- function(sex, OR = 1){
  ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
  x <- seq(10, 20, by = 0.01)
  y <- beta_duration(x, select)
  data.frame(x = x, y = y) %>%
    ggplot(aes(x = x, y = y)) +
    geom_line() +
    ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))

eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
  beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) +
    rnorm(length(duration), 0, noise_sd)
}

sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
  set.seed(seed)
  data.frame(
    duration = sample(duration, n, TRUE),
    sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
  ) %>%
    rowwise() %>%
    mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
           p = sigmoid(eta),
           cured = sample(0:1, 1, prob = c(1 - p, p)))
}

# DATA SIM PARAMETERS
durations_type <- 4  # See beta_duration for functions
sex_OR <- 3          # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7      # proportion of females in the sample
n <- 500

data <- sim_data(durations_type = 1, # See beta_duration for functions
                 sex_OR = 3,         # Odds of cure for male vs female (ref)
                 noise_sd = 1,
                 p_female = 0.7,     # proportion of females in the sample
                 n = 500,
                 seed = 21874564)
I am fitting a fractional polynomial GLM:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
              family = binomial(link = "logit"),
              data = data)
summary(model1)
Given that I am holding sex constant, is there any way to find the value of duration within a certain range that gives me the highest predicted value? Something more efficient than:
range <- seq(10, 20, by = 1e-4)
range[which.max(predict(model1, type = "response",
                        newdata = data.frame(duration = range, sex = "Male")))]
You can use optimize here. Just create a function which returns a prediction based on the value of duration:
f <- function(x) predict(model1, list(sex = 'Male', duration = x))
And we can find the value of duration which produces the maximum log odds within the range 0-20 by doing:
optimise(f, c(0, 20), maximum = TRUE)$maximum
#> [1] 17.95679
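Because the logit link is monotone increasing, the duration that maximizes the linear predictor also maximizes the predicted probability. If you want the probability itself at that point, one small follow-up (a sketch using the same model1) is to feed the maximizer back into predict with type = 'response':
best <- optimise(f, c(0, 20), maximum = TRUE)$maximum
predict(model1, newdata = data.frame(sex = 'Male', duration = best),
        type = 'response')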

Find value of covariate given a probability in R

Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
# FUNCTIONS ====================================================================
library(dplyr)    # for %>%, rowwise(), mutate()
library(ggplot2)  # for plot_beta_duration()

logit <- function(p){
  log(p / (1 - p))
}
sigmoid <- function(x){
  1 / (1 + exp(-x))
}
beta_duration <- function(D, select){
  logit(
    switch(select,
           0.05 + 0.9 / (1 + exp(-2*D + 25)),
           0.9 * exp(-exp(-0.5 * (D - 11))),
           0.9 * exp(-exp(-(D - 11))),
           0.9 * exp(-2 * exp(-(D - 9))),
           sigmoid(0.847 + 0.210 * (D - 10)),
           0.7 + 0.0015 * (D - 10) ^ 2,
           0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
    )
  )
}
beta_sex <- function(sex, OR = 1){
  ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
  x <- seq(10, 20, by = 0.01)
  y <- beta_duration(x, select)
  data.frame(x = x, y = y) %>%
    ggplot(aes(x = x, y = y)) +
    geom_line() +
    ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))

eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
  beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) +
    rnorm(length(duration), 0, noise_sd)
}

sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
  set.seed(seed)
  data.frame(
    duration = sample(duration, n, TRUE),
    sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
  ) %>%
    rowwise() %>%
    mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
           p = sigmoid(eta),
           cured = sample(0:1, 1, prob = c(1 - p, p)))
}

# DATA SIM PARAMETERS
durations_type <- 4  # See beta_duration for functions
sex_OR <- 3          # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7      # proportion of females in the sample
n <- 500

data <- sim_data(durations_type = 1, # See beta_duration for functions
                 sex_OR = 3,         # Odds of cure for male vs female (ref)
                 noise_sd = 1,
                 p_female = 0.7,     # proportion of females in the sample
                 n = 500,
                 seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
              family = binomial(link = "logit"),
              data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of using an approximation using a vector of possibilities:
pred_duration <- seq(10, 20, by = 0.1)
pred <- data.frame(expand.grid(duration = pred_duration,
                               sex = sex),
                   p = predict(model1,
                               newdata = expand.grid(duration = pred_duration,
                                                     sex = sex),
                               type = "response"))
pred[which(pred$p > 0.8), ] %>%
  group_by(sex) %>%
  summarize(min(duration))
But I am really after an exact solution.
The function uniroot finds the point at which the output of a function equals 0. If you create a function that takes duration as input, calculates the predicted probability for that duration, and then subtracts the desired probability, its output will be 0 at the desired value of duration, and uniroot will find that value for you. Wrapping this process in a little helper makes it easy to use:
find_prob <- function(p) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = 'Male')) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To prove that this is the correct value, we can feed it directly into predict to see what the predicted probability is, given sex = "Male" and duration = 12.86089:
predict(model1, type = 'response',
        newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8
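The helper above hard-codes sex = 'Male'. Since the question asks for each level of sex, a small variation (a sketch; the name find_prob2 is made up here) takes sex as a second argument. Note that uniroot needs a sign change over the interval, so this errors if no duration within range(data$duration) reaches the requested probability for that sex:
find_prob2 <- function(p, sex) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = sex)) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
find_prob2(0.8, 'Male')
find_prob2(0.8, 'Female')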

Creating Survival Trees with MST package: Undefined Columns Error?

I am trying to create a survival Tree with the MST package from R. I have been looking into this paper.
I replicated their example with randomly generated data and it works just fine. I adjusted my data to fit the same model; my data has the same columns and the same data types.
I keep getting this error:
Error in `[.data.frame`(mf_data[col.split.var], , 3) : undefined columns selected
with the following line of code:
fit <- MST(formula = Surv(time,status) ~ x1 + | id), data = data)
I have looked through all of the documentation, didn't find anything, and can't understand why this error appears.
The code from the paper looks like this:
set.seed(186117)
data <- rmultime(N = 200, K = 4, beta = c(-1, 0.8, 0.8, 0, 0), cutoff = c(0.5, 0.3, 0, 0),
                 model = "marginal.multivariate.exponential", rho = 0.65)$dat
test <- rmultime(N = 100, K = 4, beta = c(-1, 0.8, 0.8, 0, 0), cutoff = c(0.5, 0.3, 0, 0),
                 model = "marginal.multivariate.exponential", rho = 0.65)$dat
fit <- MST(formula = Surv(time, status) ~ x1 + x2 + x3 + x4 | id, data, test,
           method = "marginal", minsplit = 100, minevents = 20, selection.method = "test.sample")
I tried running your code and I do get an error, although not the one you are getting. After looking at it, I'm fairly sure you need to use the [edit] feature of SO to correct your question.
> fit <- MST(formula = Surv(time,status) ~ x1 + | id), data = data)
Error: unexpected '|' in "fit <- MST(formula = Surv(time,status) ~ x1 + |"
The formula as given is obviously wrong and there is an unnecessary closing parenthesis. I am able to get the error you report with:
> fit <- MST(formula = Surv(time,status) ~ x1 | id, data = data)
[1] "No test sample supplied, changed selection.method = 'bootstrap'"
Error in `[.data.frame`(mf_data[col.split.var], , 3) :
undefined columns selected
... but not with the original code:
fit <- MST(formula = Surv(time, status) ~ x1 + x2 + x3 + x4 | id, data, test,
           method = "marginal", minsplit = 100, minevents = 20, selection.method = "test.sample")
I also see an error with x1 + x2 | id on the RHS of the formula, but not with three variables:
> fit <- MST(formula = Surv(time, status) ~ x1 +x2 | id,data, test, method = "marginal", minsplit = 100, minevents = 20,selection.method = "test.sample")
Error in `[.data.frame`(mf_data[col.split.var], , 3) :
undefined columns selected
> fit <- MST(formula = Surv(time, status) ~ x1 +x2+x3| id,data, test, method = "marginal", minsplit = 100, minevents = 20,selection.method = "test.sample")
So I'm thinking it is a bug that the developers had not anticipated. Here's how to obtain the email address needed to report it:
> maintainer("MST")
[1] "Peter Calhoun <calhoun.peter#gmail.com>"

`nlme` with crossed random effects

I am trying to fit a crossed non-linear random effects model, like the linear random effects models mentioned in this question and in this mailing list post, using the nlme package. However, I get an error regardless of what I try. Here is an example:
library(nlme)
#####
# simulate data
set.seed(18112003)
na <- 30
nb <- 30
sigma_a <- 1
sigma_b <- .5
sigma_res <- .33
n <- na*nb
a <- gl(na,1,n)
b <- gl(nb,na,n)
u <- gl(1,1,n)
x <- runif(n, -3, 3)
y_no_noise <- x + sin(2 * x)
y <-
  x + sin(2 * x) +
  rnorm(na, sd = sigma_a)[as.integer(a)] +
  rnorm(nb, sd = sigma_b)[as.integer(b)] +
  rnorm(n, sd = sigma_res)
#####
# works in the linear model where we know the true parameter
fit <- lme(
  # somehow we found the right values
  y ~ x + sin(2 * x),
  random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))))
vv <- VarCorr(fit)
vv2 <- vv[c("a1", "b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R a1 1.016 1.0082
#R b1 0.221 0.4701
#####
# now try to do the same with `nlme`
fit <- nlme(
  y ~ c0 + sin(c1),
  fixed = list(c0 ~ x, c1 ~ x - 1),
  random = list(u = pdBlocked(list(pdIdent(~ a - 1), pdIdent(~ b - 1)))),
  start = c(0, 0.5, 1))
#R Error in nlme.formula(y ~ a * x + sin(b * x), fixed = list(a ~ 1, b ~ :
#R 'random' must be a formula or list of formulae
The lme example is similar to the one on pages 163-166 of "Mixed-effects Models in S and S-PLUS", with only 2 random effects instead of 3.
I should have used two-sided formulas, as described in help("nlme"):
fit <- nlme(
  y ~ c0 + c1 + sin(c2),
  fixed = list(c0 ~ 1, c1 ~ x - 1, c2 ~ x - 1),
  random = list(u = pdBlocked(list(pdIdent(c0 ~ a - 1), pdIdent(c1 ~ b - 1)))),
  start = c(0, 0.5, 1))
# fixed effects estimates
fixef(fit)
#R c0.(Intercept) c1.x c2.x
#R -0.1788218 0.9956076 2.0022338
# covariance estimates
vv <- VarCorr(fit)
vv2 <- vv[c("c0.a1", "c1.b1"), ]
storage.mode(vv2) <- "numeric"
print(vv2,digits=4)
#R Variance StdDev
#R c0.a1 0.9884 0.9942
#R c1.b1 0.2197 0.4688
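As a quick sanity check on these variance components (a follow-up sketch, not shown in the original answer), nlme's intervals() gives approximate confidence intervals for the random-effect standard deviations; note that it can fail for poorly identified fits:
intervals(fit, which = "var-cov")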

How to run nlxb and wrapnls inside dplyr?

I am trying to fit many nonlinear models with wrapnls in parallel using dplyr and broom (and eventually mclapply), but I am getting a parsing/evaluation error from nlxb:
Error in parse(text = joe) (from #11) : <text>:1:6: unexpected input
1: b1.10% <- 20
I get this error using both do and lapply approaches.
library(nlmrt)
library(dplyr)
library(purrr)
library(broom)
data_frame(x = seq(0, 200, 0.1),
           y = 1.2*exp(-(x - 10)^2/(2*4.2^2)) +
               2.4*exp(-(x - 50)^2/(2*3.8^2)) +
               5.3*exp(-(x - 80)^2/(2*5.1^2)) +
               rnorm(length(x), sd = 0.05)) %>%
  do({
    xl <- quantile(.$x, 0.1, na.rm = TRUE)
    xm <- quantile(.$x, 0.5, na.rm = TRUE)
    xh <- quantile(.$x, 0.8, na.rm = TRUE)
    starts <- c(a1 = 5, a2 = 5, a3 = 5,
                b1 = xl, b2 = xm, b3 = xh,
                c1 = 5, c2 = 5, c3 = 5)
    fmla <- y ~ a1*exp(-(x - b1)^2/(2*c1^2)) +
                a2*exp(-(x - b2)^2/(2*c2^2)) +
                a3*exp(-(x - b3)^2/(2*c3^2))
    df <- data_frame(x = .$x, y = .$y)
    mod <- wrapnls(fmla, lower = 0, upper = 200, start = starts, data = df)
    tidy(mod)
  })
Is there any way around this?
The problem isn't with the do aspect, it's the code inside the do, so you can debug that part directly. The starts vector is getting the b# names concatenated with the quantiles:
names(starts)
## [1] "a1" "a2" "a3" "b1.10%" "b2.50%" "b3.80%" "c1" "c2" "c3"
Adding unname to the quantile calculation fixes the issue.
data_frame(x = seq(0, 200, 0.1),
           y = 1.2*exp(-(x - 10)^2/(2*4.2^2)) +
               2.4*exp(-(x - 50)^2/(2*3.8^2)) +
               5.3*exp(-(x - 80)^2/(2*5.1^2)) +
               rnorm(length(x), sd = 0.05)) %>%
  do({
    xl <- quantile(.$x, 0.1, na.rm = TRUE) %>% unname()
    xm <- quantile(.$x, 0.5, na.rm = TRUE) %>% unname()
    xh <- quantile(.$x, 0.8, na.rm = TRUE) %>% unname()
    starts <- c(a1 = 5, a2 = 5, a3 = 5,
                b1 = xl, b2 = xm, b3 = xh,
                c1 = 5, c2 = 5, c3 = 5)
    fmla <- y ~ a1*exp(-(x - b1)^2/(2*c1^2)) +
                a2*exp(-(x - b2)^2/(2*c2^2)) +
                a3*exp(-(x - b3)^2/(2*c3^2))
    df <- data_frame(x = .$x, y = .$y)
    mod <- wrapnls(fmla, lower = 0, upper = 200, start = starts, data = df)
    tidy(mod)
  })
##   term  estimate   std.error  statistic p.value
## 1   a1  2.386492 0.007455097   320.1155       0
## 2   a2  5.296250 0.006437509   822.7174       0
## 3   a3  1.199384 0.007132559   168.1562       0
## 4   b1 49.997697 0.013702894  3648.6960       0
## 5   b2 80.004023 0.007150546 11188.5193       0
## 6   b3 10.077847 0.028644821   351.8209       0
## 7   c1  3.798829 0.013702940   277.2273       0
## 8   c2  5.094727 0.007150573   712.4921       0
## 9   c3  4.175235 0.028944448   144.2499       0
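Equivalently (a minor variation on the same fix), quantile() has a names argument, so the unname() calls can be dropped:
xl <- quantile(.$x, 0.1, na.rm = TRUE, names = FALSE)
xm <- quantile(.$x, 0.5, na.rm = TRUE, names = FALSE)
xh <- quantile(.$x, 0.8, na.rm = TRUE, names = FALSE)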
