multinomial-hmm using R - r

I have the following kind of data:
df <- data.frame(id = rep(1, each = 40),
year = seq(1961,2000),
x1 = rbinom(40, size = 1, prob = 1 - 0.6) * rpois(40, lambda = 4),
X2 = rbinom(40, size = 1, prob = 1 - 0.7) * rpois(40, lambda = 4),
X3 = rbinom(40, size = 1, prob = 1 - 0.6) * rpois(40, lambda = 5),
X4 = rbinom(40, size = 1, prob = 1 - 0.7) * rpois(40, lambda = 6))
As you can see in my data there are four count variables.
I want to estimate a multinomial-HMM, as I expect that there is a latent variable, C, with 3 possible states that affect Pr(X_t=xt) (each vector X_t is assumed to be mutually independent conditional on the Markov chain C_t). For example, I expect that if C_t=1 we would observe a vector of X_t more like this (4,1,0,0), if C_T=2 a vector of X_t more like this (0,1,1,0) and if C=3 it is more likely to observe a vector of X_t like this (0,0,1,5).
I haven't found a package able to estimate this type of model, so currently, I am using the depmixS4 package.
library(depmixS4)
mod<-depmix(list(X1 ~ 1, X2 ~ 1, X3 ~ 1, X4 ~ 1), data=df, nstates=3,
family=list(poisson(), poisson(), poisson(),poison()))
However, I am not sure this would be a correct model according to my theoretical expectations. Can I use depmix differently to be more suitable to my model?

You could simply use a multinomial distribution for the response. I'm assuming that you mean to let X1, ..., X4 refer to four levels of a single categorical variable? And each of these variables then contains the count of a level in a particular year? One option is then to fit the following model:
> library(depmixS4)
Loading required package: nnet
Loading required package: MASS
Loading required package: Rsolnp
Loading required package: nlme
> set.seed(1)
> df <- data.frame(id = rep(1, each = 40),
+ year = seq(1961,2000),
+ X1 = rbinom(40, size = 1, prob = 1 - 0.6) * rpois(40, lambda = 4),
+ X2 = rbinom(40, size = 1, prob = 1 - 0.7) * rpois(40, lambda = 4),
+ X3 = rbinom(40, size = 1, prob = 1 - 0.6) * rpois(40, lambda = 5),
+ X4 = rbinom(40, size = 1, prob = 1 - 0.7) * rpois(40, lambda = 6))
> # matrix for single multinomial response variable
> X <- as.matrix(df[,c("X1", "X2", "X3", "X4")])
>
> # formulate model
> mod<-depmix(X ~ 1, data=df, nstates=3,
+ family=multinomial("identity"))
>
> # fit model
> fmod <- fit(mod)
converged at iteration 22 with logLik: -161.5714
>
> # show results
> summary(fmod)
Initial state probabilities model
pr1 pr2 pr3
0 0 1
Transition matrix
toS1 toS2 toS3
fromS1 0.290 0.355 0.356
fromS2 0.132 0.328 0.540
fromS3 0.542 0.191 0.267
Response parameters
Resp 1 : multinomial
Re1.pr1 Re1.pr2 Re1.pr3 Re1.pr4
St1 0.092 0.56 0.288 0.061
St2 0.033 0.00 0.423 0.544
St3 0.608 0.00 0.392 0.000

Related

How can I find the maximum output of a function

If I have a GLM, is there any way I can efficiently find the maximum output by changing one covariate and holding the others?
Using my simulated data:
# FUNCTIONS ====================================================================
logit <- function(p){
x = log(p/(1-p))
x
}
sigmoid <- function(x){
p = 1/(1 + exp(-x))
p
}
beta_duration <- function(D, select){
logit(
switch(select,
0.05 + 0.9 / (1 + exp(-2*D + 25)),
0.9 * exp(-exp(-0.5 * (D - 11))),
0.9 * exp(-exp(-(D - 11))),
0.9 * exp(-2 * exp(-(D - 9))),
sigmoid(0.847 + 0.210 * (D - 10)),
0.7 + 0.0015 * (D - 10) ^ 2,
0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
)
)
}
beta_sex <- function(sex, OR = 1){
ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
x <- seq(10, 20, by = 0.01)
y <- beta_duration(x, select)
data.frame(x = x,
y = y) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) + rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
set.seed(seed)
data.frame(
duration = sample(duration, n, TRUE),
sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
) %>%
rowwise() %>%
mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
p = sigmoid(eta),
cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
sex_OR = 3, # Odds of cure for male vs female (ref)
noise_sd = 1,
p_female = 0.7, # proportion of females in the sample
n = 500,
seed = 21874564)
I am fitting a fractional polynomial GLM:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
family = binomial(link = "logit"),
data = data)
summary(model1)
Given that I am holding sex as constant, is there any way to find the value of duration within a certain range that gives me the highest predicted value? Something less inefficient than:
range <- seq(10, 20, by = 1e-4)
range[which.max(predict(model, type = "response", newdata = data.frame(duration = range, sex = "Male")))]
You can use optimize here. Just create a function which returns a prediction based on the value of duration:
f <- function(x) predict(model1, list(sex = 'Male', duration = x))
And we can find the value of duration which produces the maximum log odds within the range 0-20 by doing:
optimise(f, c(0, 20), maximum = TRUE)$maximum
#> [1] 17.95679

Find value of covariate given a probability in R

Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
# FUNCTIONS ====================================================================
logit <- function(p){
x = log(p/(1-p))
x
}
sigmoid <- function(x){
p = 1/(1 + exp(-x))
p
}
beta_duration <- function(D, select){
logit(
switch(select,
0.05 + 0.9 / (1 + exp(-2*D + 25)),
0.9 * exp(-exp(-0.5 * (D - 11))),
0.9 * exp(-exp(-(D - 11))),
0.9 * exp(-2 * exp(-(D - 9))),
sigmoid(0.847 + 0.210 * (D - 10)),
0.7 + 0.0015 * (D - 10) ^ 2,
0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
)
)
}
beta_sex <- function(sex, OR = 1){
ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
x <- seq(10, 20, by = 0.01)
y <- beta_duration(x, select)
data.frame(x = x,
y = y) %>%
ggplot(aes(x = x, y = y)) +
geom_line() +
ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) + rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
set.seed(seed)
data.frame(
duration = sample(duration, n, TRUE),
sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
) %>%
rowwise() %>%
mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
p = sigmoid(eta),
cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # See beta_duration for functions
sex_OR <- 3 # Odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7 # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # See beta_duration for functions
sex_OR = 3, # Odds of cure for male vs female (ref)
noise_sd = 1,
p_female = 0.7, # proportion of females in the sample
n = 500,
seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
family = binomial(link = "logit"),
data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of using an approximation using a vector of possibilities:
pred_duration <- seq(10, 20, by = 0.1)
pred <- data.frame(expand.grid(duration = pred_duration,
sex = sex),
p = predict(model1,
newdata = expand.grid(duration = pred_duration,
sex = sex),
type = "response"))
pred[which(pred$p > 0.8), ] %>%
group_by(sex) %>%
summarize(min(duration))
But I am really after an exact solution.
The function uniroot allows you to detect the point at which the output of a function equals 0. If you create a function that takes duration as input, calculates the predicted probability from that duration, then subtracts the desired probability, then this function will have an output of 0 at the desired value of duration. uniroot will find this value for you. If you wrap this process in a little function, it makes it very easy to use:
find_prob <- function(p) {
f <- function(v) {
predict(model1, type = 'response',
newdata = data.frame(duration = v, sex = 'Male')) - p
}
uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To prove that this is the correct value, we can feed it directly into predict to see what the predicted probability will be given sex = male and duration = 12.86089
predict(model1, type = 'response',
newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8

Confusion about 'standardize' option of glmnet package in R

I have a confusion about the standardize option of glmnet package in R. I get different coefficients when I standardize the covariates matrix and set standardize=FALSE vs. when I do not standardize the covariates matrix and set standardize=TRUE. I assumed they would be the same! These two are shown with an example by creating the following ridge.mod1 and ridge.mod2 models. I also created a model (ridge.mod3) that standardized the outcome (and the covariates matrix) and used the option standardize=FALSE. I was just checking if I needed to standardize the outcome too to get the same coefficients as in ridge.mod1.
set.seed(1)
y <- rnorm(30, 20, 10)
x1 <- rnorm(30, 5, 2)
x2 <- x1 + rnorm(30, 0, 5)
cor(x1,x2)
x <- as.matrix(cbind(x1,x2))
z1 <- scale(x1)
z2 <- scale(x2)
z <- as.matrix(cbind(z1,z2))
y.scale <- scale(y)
n <- 30
# Fixing foldid for proper comparison
foldid=sample(rep(seq(5),length=n))
table(foldid)
library(glmnet)
cv.ridge.mod1 <- cv.glmnet(x, y, alpha = 0, nfolds = 5, foldid=foldid, standardize = TRUE)
ridge.mod1 <- glmnet(x, y, alpha = 0, standardize = TRUE)
coef(ridge.mod1, s=cv.ridge.mod1$lambda.min)
> coef(ridge.mod1, s=cv.ridge.mod1$lambda.min)
3 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 2.082458e+01
x1 2.856136e-37
x2 4.334910e-38
cv.ridge.mod2 <- cv.glmnet(z, y, alpha = 0, nfolds = 5, foldid=foldid, standardize = FALSE)
ridge.mod2 <- glmnet(z, y, alpha = 0, standardize = FALSE)
coef(ridge.mod2, s=cv.ridge.mod2$lambda.min)
> coef(ridge.mod2, s=cv.ridge.mod2$lambda.min)
3 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 2.082458e+01
V1 4.391657e-37
V2 2.389751e-37
cv.ridge.mod3 <- cv.glmnet(z, y.scale, alpha = 0, nfolds = 5, foldid=foldid, standardize = FALSE)
ridge.mod3 <- glmnet(z, y.scale, alpha = 0, standardize = FALSE)
coef(ridge.mod3, s=cv.ridge.mod3$lambda.min)
> coef(ridge.mod3, s=cv.ridge.mod3$lambda.min)
3 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1.023487e-16
V1 4.752255e-38
V2 2.585973e-38
Could anyone please tell me what's going on there and if (or how) I can get the same coefficients as in ridge.mod1 with prior standardization (in the data processing step) and then using standardize=FALSE?
Update: (what I tried based on the comments below)
So, I tried standardizing by SS/n instead of SS/(n-1). I tried by standardizing both y and x. Neither gave me coefficients equal to the de-standardized coefficients of model 1.
## Standadizing by sqrt(SS(X)/n) like glmnet instead of sqrt(SS(X)/(n-1)) which is done by the scale command
Xs <- apply(x, 2, function(m) (m - mean(m)) / sqrt(sum(m^2) / n))
Ys <- (y-mean(y)) / sqrt(sum(y^2) / n)
# Standadizing only X by sqrt(SS(X)/n)
cv.ridge.mod4 <- cv.glmnet(Xs, y, alpha = 0, nfolds = 5, foldid=foldid, standardize = FALSE)
ridge.mod4 <- glmnet(Xs, y, alpha = 0, standardize = FALSE)
coef(ridge.mod4, s=cv.ridge.mod4$lambda.min)
> coef(ridge.mod4, s=cv.ridge.mod4$lambda.min)[2]/sd(x1)
[1] 7.995171e-38
> coef(ridge.mod4, s=cv.ridge.mod4$lambda.min)[3]/sd(x2)
[1] 2.957854e-38
# Standadizing both Y and X by sqrt(SS(X)/n) but neither is centered
cv.ridge.mod6 <- cv.glmnet(Xs.noncentered, Ys.noncentered, alpha = 0, nfolds = 5, foldid=foldid, standardize = FALSE)
ridge.mod6 <- glmnet(Xs.noncentered, Ys.noncentered, alpha = 0, standardize = FALSE)
coef(ridge.mod6, s=cv.ridge.mod6$lambda.min)
> coef(ridge.mod6, s=cv.ridge.mod6$lambda.min)[2] / (sqrt(sum(x1^2) / n))
[1] 1.019023e-39
> coef(ridge.mod6, s=cv.ridge.mod6$lambda.min)[3] / (sqrt(sum(x2^2) / n))
[1] 9.189263e-40
What is it that still is wrong there?
I tweaked your code so that I can work with a more sensible problem. In order to reproduce the coefficients changing the standardize=TRUE and standardize=FALSE options you need to first standardize the variables with the (1/N) variance estimator formula. For this example I also centered the variables to get rid of the constant. I focus only on the coefficients of the variables. After that you have to notice that hence you have to invert that formula to get the de-standardized coefficients. I do that in the following code.
set.seed(1)
x1 <- rnorm(300, 5, 2)
x2 <- x1 + rnorm(300, 0, 5)
x3 <- rnorm(300, 6, 5)
e= rnorm(300, 0, 1)
y <- 0.3*x1+3.5*x2+x3+e
x <- as.matrix(cbind(x1,x2,x3))
sdN=function(x){
sigma=sqrt( (1/length(x)) * sum((x-mean(x))^2))
return(sigma)
}
n=300
foldid=sample(rep(seq(5),length=n))
g1=(x1-mean(x1))/sdN(x1)
g2=(x2-mean(x2))/sdN(x2)
g3=(x3-mean(x3))/sdN(x3)
gy=(y-mean(y))/sdN(y)
equis <- as.matrix(cbind(g1,g2,g3))
library(glmnet)
cv.ridge.mod1 <- cv.glmnet(x, y, alpha = 0, nfolds = 5, foldid=foldid,standardize = TRUE)
coef(cv.ridge.mod1, s=cv.ridge.mod1$lambda.min)
cv.ridge.mod2 <- cv.glmnet(equis, gy, alpha = 0, nfolds = 5, foldid=foldid, standardize = FALSE, intercept=FALSE)
beta=coef(cv.ridge.mod2, s=cv.ridge.mod2$lambda.min)
beta[2]*sdN(y)/sdN(x1)
beta[3]*sdN(y)/sdN(x2)
beta[4]*sdN(y)/sdN(x3)
coef(cv.ridge.mod1, s=cv.ridge.mod1$lambda.min)
this yields the results:
> beta[2]*sdN(y)/sdN(x1)
[1] 0.5984356
> beta[3]*sdN(y)/sdN(x2)
[1] 3.166033
> beta[4]*sdN(y)/sdN(x3)
[1] 0.9145646
>
> coef(cv.ridge.mod1, s=cv.ridge.mod1$lambda.min)
4 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 0.5951423
x1 0.5984356
x2 3.1660328
x3 0.9145646
As you can see the coefficients are the same at 4 decimals. So I hope this answer your question.

compute numbers of leaf in rpart

i use rpart run a regression tree
library(MASS)
N = 1000
episolon = rnorm(N, 0, 0.01)
x1 = rnorm(N, 0, sd=1)
x2 = rnorm(N, 0, sd=1)
eta_x = 1/2*x1+x2
Kappa_x = 1/2*x1
w = rbinom(N, 1, 0.5)
treatment = w
makeY = function(eta, Kappa){
Y = eta+1/2*(2*w-1)*Kappa+episolon
}
Y1 = makeY(eta_x, Kappa_x)
fit = rpart(Y1 ~ x1 + x2)
plot(fit)
text(fit)
compute numbers of leaf in rpart
I want to have a function to give me there are 12 leaves in this tree
The fit object has all the information that you need. You can examine it using str(fit).
Two ways to find the number of leaves are:
sum(fit$frame$ncompete == 0)
[1] 11
AND
sum(fit$frame$var == "<leaf>")
[1] 11

Getting error when using plot.gbm to produce marginal plots

Total novice to R -
I am trying to make some marginal plots from a BRT I completed with the gbm package and keep getting the same error.
Below is my code; boosted.tree_LRFF is the output I got from completing a gbm.fit
> plot.gbm(boosted.tree_LRFF,
+ i.var= 5,
+ n.trees = train.model$finalModel$tuneValue$n.trees,
+ continuous.resolution = 100,
+ return.grid = FALSE,
+ type = "link")
Error in plot.window(...) : need finite 'ylim' values
In addition: Warning messages:
1: In min(x) : no non-missing arguments to min; returning Inf
2: In max(x) : no non-missing arguments to max; returning -Inf
In this first section I just re-create the dataset used to fit a gbm from the "gbm.pdf" for the package:
library(gbm)
N <- 1000
X1 <- runif(N)
X2 <- 2 * runif(N)
X3 <- ordered(sample(letters[1:4], N, replace = TRUE), levels = letters[4:1])
X4 <- factor(sample(letters[1:6], N, replace = TRUE))
X5 <- factor(sample(letters[1:3], N, replace = TRUE))
X6 <- 3 * runif(N)
mu <- c(-1, 0, 1, 2)[as.numeric(X3)]
SNR <- 10 # signal-to-noise ratio
Y <- X1 ** 1.5 + 2 * (X2 ** .5) + mu
sigma <- sqrt(var(Y) / SNR)
Y <- Y + rnorm(N, 0, sigma)
# introduce some missing values
X1[sample(1:N, size = 500)] <- NA
X4[sample(1:N, size = 300)] <- NA
data <- data.frame(Y = Y, X1 = X1, X2 = X2, X3 = X3, X4 = X4, X5 = X5, X6 = X6)
boosted.tree_LRFF <-
gbm(Y ~ X1 + X2 + X3 + X4 + X5 + X6,
data = data,
var.monotone = c(0, 0, 0, 0, 0, 0),
distribution = "gaussian",
n.trees = 1000,
shrinkage = 0.05,
interaction.depth = 3,
bag.fraction = 0.5,
train.fraction = 0.5,
n.minobsinnode = 10,
cv.folds = 3,
keep.data = TRUE,
verbose = FALSE,
n.cores = 1)
Now I plot the tree function values for variable x5, similar to your plot:
plot(boosted.tree_LRFF,
i.var = 5,
n.trees = boosted.tree_LRFF$n.trees,
continuous.resolution = 100,
return.grid = FALSE,
type = "link")
I think your error is due to the n.trees argument. You can either enter it as a constant or it can be from a GBM fitted object. In my example I used it from "boosted.tree_LRFF" that appears to be the name of the original fitted object in your example (although of course my data was different).

Resources