Plotting a log-likelihood function in R

Hi, I get an "x and y lengths differ" error from my code below. datasim is a simulated sample of size 1000. Please help me with this.
x <- datasim
loglik <- function(theta){
  k <- theta[1]
  lambda <- theta[2]
  out <- sum(dweibull(x, shape = k, scale = lambda, log = TRUE))
  return(out)
}
theta <- c(0.5, 1.5)
plot(theta, loglik(theta), type = "l", lwd = 3, main = "loglikelihood_Weibull, n=1000")

Since it is a Weibull distribution you are fitting, you can use a package:
library(fitdistrplus)
x <- rweibull(1000, 20, 10)
fit <- fitdist(x, "weibull")
llplot(fit, expand = 5)
Or using your loglik function:
library(lattice)
da <- expand.grid(k = seq(8, 22, length.out = 50),
                  lambda = seq(9, 12, length.out = 50))
da$LL <- apply(da, 1, loglik)
wireframe(LL ~ k * lambda, data = da,
          scales = list(arrows = FALSE), drape = TRUE, colorkey = TRUE)
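For reference, the original error occurs because plot() is given theta (length 2) as x and the single number returned by loglik(theta) as y. To plot the log-likelihood you need to evaluate it over a grid of parameter values. A minimal one-dimensional sketch, assuming x <- rweibull(1000, 20, 10) as above and holding the scale fixed at 10:
# Profile of the log-likelihood over the shape parameter, scale fixed at 10
k_grid <- seq(15, 25, length.out = 200)
ll_k <- sapply(k_grid, function(k) loglik(c(k, 10)))
plot(k_grid, ll_k, type = "l", lwd = 3,
     xlab = "shape k", ylab = "log-likelihood",
     main = "Weibull log-likelihood, n = 1000 (scale fixed at 10)")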

Related

How to fit Gaussian distribution with one-sided data?

x <- c(-3,-2.5,-2,-1.5,-1,-0.5)
y <- c(2,2.5,2.6,2.9,3.2,3.3)
The challenge is that all of the data lie on the left slope of the curve; how can I fit a two-sided Gaussian distribution to it?
The question gives incomplete information, so several approaches can be used. Note that the data are insufficient; for example, trying to fit this with a plain nls call does not work.
Here is one way to tackle it:
# Least-squares objective: y modelled as par[3] * dnorm(x, mean = par[1], sd = par[2])
f <- function(par, x, y) sum((y - par[3] * dnorm(x, par[1], par[2]))^2)
a <- optim(c(0, 1, 1), f, x = x, y = y)$par
plot(x, y, xlim = c(-3, 3.5), ylim = c(2, 3.5))
curve(dnorm(x, a[1], a[2]) * a[3], add = TRUE, col = 2)
There is no way to fit a Gaussian density directly to these y-values: they are not valid density values for any reasonable Gaussian, so either a scaling factor or corrected y-values are needed. If correct y-values had been provided, this would be one way of solving the problem:
# Define the function to be optimized
f <- function(pars, x, y){
  mu <- pars[1]
  sigma <- pars[2]
  y_hat <- dnorm(x, mu, sigma)
  se <- (y - y_hat)^2
  sum(se)
}
# Define the data
x <- c(-3, -2.5, -2, -1.5, -1, -0.5)
y <- c(2, 2.5, 2.6, 2.9, 3.2, 3.3)
# Find the best parameters
opt <- optim(c(-.5, .1), f, method = 'SANN', x = x, y = y)
plot(
  seq(-5, 5, length.out = 200),
  dnorm(seq(-5, 5, length.out = 200), opt$par[1], opt$par[2]),
  type = 'l', col = 'red'
)
points(x, y)
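To see why a direct dnorm fit is problematic here: a normal density peaks at 1 / (sigma * sqrt(2 * pi)), so matching y-values around 3 would force sigma to be very small. A quick base-R check:
# Peak height of dnorm for a few values of sigma; matching y near 3
# would require sigma below roughly 0.13
sigma <- c(0.1, 0.5, 1, 2)
data.frame(sigma = sigma, peak_density = 1 / (sigma * sqrt(2 * pi)))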
Use nls to get a least squares fit of y to .lin.a * dnorm(x, b, c) where .lin.a, b and c are parameters to be estimated.
fm <- nls(y ~ cbind(a = dnorm(x, b, c)),
          start = list(b = mean(x), c = sd(x)), algorithm = "plinear")
fm
giving:
Nonlinear regression model
  model: y ~ cbind(a = dnorm(x, b, c))
   data: parent.frame()
      b       c  .lin.a
 0.2629  3.2513 27.7287
 residual sum-of-squares: 0.02822

Number of iterations to convergence: 7
Achieved convergence tolerance: 2.582e-07
The dnorm model (black curve) seems to fit the points although even a straight line (blue line) involving only two parameters (intercept and slope) instead of 3 isn't bad.
plot(y ~ x)
lines(fitted(fm) ~ x)
fm.lin <- lm(y ~ x)
abline(fm.lin, col = "blue")
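If you want the fitted curve as an explicit function of x rather than through fitted(), the estimated coefficients can be plugged back into dnorm; a small usage sketch based on the fit above:
co <- coef(fm)   # named vector with b, c and .lin.a
plot(y ~ x, xlim = c(-3, 3.5), ylim = c(2, 3.5))
curve(co[".lin.a"] * dnorm(x, co["b"], co["c"]), add = TRUE)
abline(fm.lin, col = "blue")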

r: coefficients from glmnet and caret are different for the same lambda

I've read a few Q&As about this, but I am still not sure I understand why the coefficients from glmnet and caret models based on the same sample and the same hyper-parameters are slightly different. I would greatly appreciate an explanation!
I am using caret to train a ridge regression:
library(ISLR)
Hitters = na.omit(Hitters)
x = model.matrix(Salary ~ ., Hitters)[, -1] #Dropping the intercept column.
y = Hitters$Salary
set.seed(0)
train = sample(1:nrow(x), 7*nrow(x)/10)
library(caret)
set.seed(0)
train_control = trainControl(method = 'cv', number = 10)
grid = 10 ^ seq(5, -2, length = 100)
tune.grid = expand.grid(lambda = grid, alpha = 0)
ridge.caret = train(x[train, ], y[train],
                    method = 'glmnet',
                    trControl = train_control,
                    tuneGrid = tune.grid)
ridge.caret$bestTune
# alpha is 0 and best lambda is 242.0128
Now, I use the lambda (and alpha) found above to train a ridge regression for the whole data set. At the end, I extract the coefficients:
ridge_full <- train(x, y,
                    method = 'glmnet',
                    trControl = trainControl(method = 'none'),
                    tuneGrid = expand.grid(
                      lambda = ridge.caret$bestTune$lambda, alpha = 0)
)
coef(ridge_full$finalModel, s = ridge.caret$bestTune$lambda)
Finally, using exactly the same alpha and lambda, I try to fit the same ridge regression using glmnet package - and extract coefficients:
library(glmnet)
ridge_full2 = glmnet(x, y, alpha = 0, lambda = ridge.caret$bestTune$lambda)
coef(ridge_full2)
The reason is that the exact lambda you specified was not used by caret. You can check this with:
ridge_full$finalModel$lambda
The closest values are 261.28915 and 238.07694.
When you do
coef(ridge_full$finalModel, s = ridge.caret$bestTune$lambda)
where s is 242.0128, the coefficients are interpolated from the coefficients actually calculated.
Whereas when you provide lambda to the glmnet call, the model returns exact coefficients for that lambda, which differ only slightly from the interpolated ones caret returns.
Why this happens: when you specify a single alpha and a single lambda for a fit on all of the data, caret will actually fit:
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  numLev <- if(is.character(y) | is.factor(y)) length(levels(y)) else NA
  theDots <- list(...)
  if(all(names(theDots) != "family")) {
    if(!is.na(numLev)) {
      fam <- ifelse(numLev > 2, "multinomial", "binomial")
    } else fam <- "gaussian"
    theDots$family <- fam
  }
  ## pass in any model weights
  if(!is.null(wts)) theDots$weights <- wts
  if(!(class(x)[1] %in% c("matrix", "sparseMatrix")))
    x <- Matrix::as.matrix(x)
  modelArgs <- c(list(x = x,
                      y = y,
                      alpha = param$alpha),
                 theDots)
  out <- do.call(glmnet::glmnet, modelArgs)
  if(!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
  out
}
This fit function was taken from the caret package's glmnet model definition. Notice that param$lambda is never passed to glmnet; the call uses only alpha plus any extra arguments, so glmnet computes its own lambda path and your value is only stored as lambdaOpt.
In your example this translates to:
fit <- glmnet::glmnet(x, y, alpha = 0)
lambda <- unique(fit$lambda)
These lambda values correspond to ridge_full$finalModel$lambda:
all.equal(lambda, ridge_full$finalModel$lambda)
#output
TRUE
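If you want glmnet itself to report coefficients at a lambda that is not on the computed path, coef() can refit exactly at that value instead of interpolating. A minimal sketch, assuming the x, y and ridge.caret objects from above (exact = TRUE requires the original data to be supplied):
# Default: coefficients at s are interpolated from the fitted path
fit <- glmnet::glmnet(x, y, alpha = 0)
coef_interp <- coef(fit, s = ridge.caret$bestTune$lambda)
# exact = TRUE refits at s, so it should match glmnet(x, y, alpha = 0, lambda = s) closely
coef_exact <- coef(fit, s = ridge.caret$bestTune$lambda, exact = TRUE, x = x, y = y)
cbind(interpolated = as.vector(coef_interp), exact = as.vector(coef_exact))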

Example of fitting marginal distributions to histogram in R

Could someone show me how to fit a polynomial marginal distribution to my data? I have done a binomial and beta binomial, but I would like to see how to fit a polynomial. I would also be interested in trying a gamma if that is something you know how to do.
This is what I have done so far.
library(dplyr)
library(ggplot2)
library(stats4)   # for mle()

nodes <- read.table("https://web.stanford.edu/~hastie/CASI_files/DATA/nodes.txt",
                    header = TRUE)
nodes %>%
  ggplot(aes(x = x/n)) +
  geom_histogram(bins = 30) +
  theme_bw() +
  labs(x = "p = x/n")
# log-likelihood function
ll <- function(alpha, beta) {
  x <- nodes$x
  total <- nodes$n
  -sum(VGAM::dbetabinom.ab(x, total, alpha, beta, log = TRUE))
}
# maximum likelihood estimation
m <- mle(ll, start = list(alpha = 1, beta = 10), method = "L-BFGS-B",
         lower = c(0.0001, .1))
ab <- coef(m)
alpha0 <- ab[1]
beta0 <- ab[2]
nodes %>%
  ggplot() +
  geom_histogram(aes(x/n, y = ..density..), bins = 30) +
  stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red",
                size = 1) +
  xlab("p = x/n")
Here is another fit:
ll <- function(a){
  x <- nodes$x
  total <- nodes$n
  -sum(stats::dbinom(x, total, a, log = TRUE))
}
# stats::dbinom()
m <- mle(ll, start = list(a = .5), method = "L-BFGS-B",
         lower = c(0.0001, .1))
a <- coef(m)
nodes %>%
  ggplot() +
  geom_histogram(aes(x/n, y = ..density..), bins = 40) +
  stat_function(fun = function(x) dbeta(x, a, 1), color = "red",
                size = 1) +
  xlab("proportion x/n")
For fitting a gamma distribution:
data(iris)
library(MASS)      # for the fitdistr function
library(ggplot2)
fit.params <- fitdistr(iris$Sepal.Length, "gamma", lower = c(0, 0))
ggplot(data = iris) +
  geom_histogram(aes(x = Sepal.Length, y = ..density..)) +
  geom_line(aes(x = Sepal.Length,
                y = dgamma(Sepal.Length, fit.params$estimate["shape"],
                           fit.params$estimate["rate"])),
            color = "red", size = 1) +
  theme_classic()
You might also like to take a look at the distribution of the quantiles using the qqp function in the car package. Here are a few examples:
library(car)
qqp(iris$Sepal.Length, "norm")    ## normal distribution
qqp(iris$Sepal.Length, "lnorm")   ## log-normal distribution
gamma <- fitdistr(iris$Sepal.Length, "gamma")
qqp(iris$Sepal.Length, "gamma", shape = gamma$estimate[[1]],
    rate = gamma$estimate[[2]])   ## gamma distribution
nbinom <- fitdistr(iris$Sepal.Length, "Negative Binomial")
qqp(iris$Sepal.Length, "nbinom", size = nbinom$estimate[[1]],
    mu = nbinom$estimate[[2]])    ## negative binomial distribution
You can use fitdistr to estimate parameters for either ggplot overlays or qqp plots; it supports lots of different distributions. Take a look at ?fitdistr.
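The same approach can be applied to the proportions in the question's nodes data. A sketch, assuming nodes has been read in as above (the gamma distribution is only defined for positive values, so zero proportions are dropped here):
library(MASS)
library(ggplot2)
# Fit a gamma distribution to the observed proportions p = x/n
p <- nodes$x / nodes$n
p <- p[p > 0]                      # gamma support is strictly positive
gamma.params <- fitdistr(p, "gamma", lower = c(0, 0))
ggplot(data.frame(p = p), aes(x = p)) +
  geom_histogram(aes(y = ..density..), bins = 30) +
  stat_function(fun = dgamma,
                args = list(shape = gamma.params$estimate["shape"],
                            rate = gamma.params$estimate["rate"]),
                color = "red", size = 1) +
  xlab("p = x/n")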

Include weibull fit in ggsurvplot

I would like to fit a Weibull curve to some event data and then include the fitted Weibull curve in a survival plot produced by survminer::ggsurvplot. Any ideas how?
Here is an example to work on:
A function for simulating weibull data:
# N = sample size
# lambda = scale parameter in h0()
# rho = shape parameter in h0()
# beta = fixed effect parameter
# rateC = rate parameter of the exponential distribution of C
simulWeib <- function(N, lambda, rho, beta, rateC)
{
  # covariate --> N Bernoulli trials
  x <- sample(x = c(0, 1), size = N, replace = TRUE, prob = c(0.5, 0.5))
  # Weibull latent event times
  v <- runif(n = N)
  Tlat <- (-log(v) / (lambda * exp(x * beta)))^(1 / rho)
  # censoring times
  C <- rexp(n = N, rate = rateC)
  # follow-up times and event indicators
  time <- pmin(Tlat, C)
  status <- as.numeric(Tlat <= C)
  # data set
  data.frame(id = 1:N,
             time = time,
             status = status,
             x = x)
}
Generate the data:
library(survival)
set.seed(1234)
betaHat <- rep(NA, 1e3)
for(k in 1:1e3)
{
  dat <- simulWeib(N = 100, lambda = 0.01, rho = 1, beta = -0.6, rateC = 0.001)
  fit <- coxph(Surv(time, status) ~ x, data = dat)
  betaHat[k] <- fit$coef
}
# Estimate a survival function
survfit(Surv(as.numeric(time), x) ~ 1, data = dat) -> out0
# Plot
library(survminer)
ggsurvplot(out0, data = dat, risk.table = TRUE)
gg1 <- ggsurvplot(
  out0,                          # survfit object with calculated statistics.
  data = dat,                    # data used to fit survival curves.
  risk.table = TRUE,             # show risk table.
  pval = TRUE,                   # show p-value of log-rank test.
  conf.int = TRUE,               # show confidence intervals for
                                 # point estimates of survival curves.
  xlim = c(0, 2000),             # present narrower X axis, but not affect
                                 # survival estimates.
  break.time.by = 500,           # break X axis in time intervals by 500.
  ggtheme = theme_minimal(),     # customize plot and risk table with a theme.
  risk.table.y.text.col = TRUE,  # colour risk table text annotations.
  risk.table.y.text = FALSE,     # show bars instead of names in text annotations
                                 # in legend of risk table.
  surv.median.line = "hv",
  color = "darkgreen",
  conf.int.fill = "lightblue",
  title = "Survival probability"
)
gg1
As far as I can see, it is not possible to do this with ggsurvplot at the moment.
I created an issue requesting this feature: https://github.com/kassambara/survminer/issues/276
You can plot survivor curves of a Weibull model with ggplot2 like this:
library("survival")
wbmod <- survreg(Surv(time, status) ~ x, data = dat)
s <- seq(.01, .99, by = .01)
t_0 <- predict(wbmod, newdata = data.frame(x = 0),
type = "quantile", p = s)
t_1 <- predict(wbmod, newdata = data.frame(x = 1),
type = "quantile", p = s)
smod <- data.frame(time = c(t_0, t_1),
surv = rep(1 - s, times = 2),
strata = rep(c(0, 1), each = length(s)),
upper = NA, lower = NA)
head(surv_summary(cm))
library("ggplot2")
ggplot() +
geom_line(data = smod, aes(x = time, y = surv, color = factor(strata))) +
theme_classic()
However to my knowledge you cannot use survminer (yet):
library("survminer")
# wrong:
ggsurvplot(smod)
# does not work:
gg1$plot + geom_line(data = smod, aes(x = time, y = surv, color = factor(strata)))
The following works for me; credit probably goes to Heidi for filing the feature request.
Hope someone finds this useful.
library(survival)
library(survminer)
library(tidyr)

s <- with(lung, Surv(time, status))
sWei <- survreg(s ~ as.factor(sex), dist = 'weibull', data = lung)
fKM <- survfit(s ~ sex, data = lung)

pred.sex1 <- predict(sWei, newdata = list(sex = 1), type = "quantile", p = seq(.01, .99, by = .01))
pred.sex2 <- predict(sWei, newdata = list(sex = 2), type = "quantile", p = seq(.01, .99, by = .01))

df <- data.frame(y = seq(.99, .01, by = -.01), sex1 = pred.sex1, sex2 = pred.sex2)
df_long <- gather(df, key = "sex", value = "time", -y)

p <- ggsurvplot(fKM, data = lung, risk.table = TRUE)
p$plot <- p$plot + geom_line(data = df_long, aes(x = time, y = y, group = sex))
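Note that after modifying p$plot you still need to print the object to display the combined figure. The same pattern can also be applied to the simulated dat from the question; this is a sketch (using status as the event indicator and x as the stratum), not part of the original answer:
print(p)
# Weibull fit and Kaplan-Meier curves for the question's simulated data
sWei_dat <- survreg(Surv(time, status) ~ x, dist = "weibull", data = dat)
fKM_dat <- survfit(Surv(time, status) ~ x, data = dat)
pr <- seq(.01, .99, by = .01)
pred_x0 <- predict(sWei_dat, newdata = list(x = 0), type = "quantile", p = pr)
pred_x1 <- predict(sWei_dat, newdata = list(x = 1), type = "quantile", p = pr)
wdf <- data.frame(time = c(pred_x0, pred_x1),
                  surv = rep(1 - pr, 2),
                  x = rep(c(0, 1), each = length(pr)))
p2 <- ggsurvplot(fKM_dat, data = dat, risk.table = TRUE)
p2$plot <- p2$plot + geom_line(data = wdf, aes(x = time, y = surv, group = x))
print(p2)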

Confidence intervals for predictions from logistic regression

In R, predict.lm computes predictions based on the results from linear regression and also offers to compute confidence intervals for these predictions. According to the manual, these intervals are based on the error variance of the fit, but not on the error intervals of the coefficients.
On the other hand, predict.glm, which computes predictions based on logistic and Poisson regression (amongst a few others), doesn't have an option for confidence intervals. And I even have a hard time imagining how such confidence intervals could be computed to provide meaningful insight for Poisson and logistic regression.
Are there cases in which it is meaningful to provide confidence intervals for such predictions? How can they be interpreted? And what are the assumptions in these cases?
The usual way is to compute a confidence interval on the scale of the linear predictor, where things will be more normal (Gaussian) and then apply the inverse of the link function to map the confidence interval from the linear predictor scale to the response scale.
To do this you need two things:
call predict() with type = "link", and
call predict() with se.fit = TRUE.
The first produces predictions on the scale of the linear predictor, and the second returns the standard errors of those predictions. In pseudo-code:
## foo <- mtcars[,c("mpg","vs")]; names(foo) <- c("x","y") ## Working example data
mod <- glm(y ~ x, data = foo, family = binomial)
preddata <- with(foo, data.frame(x = seq(min(x), max(x), length = 100)))
preds <- predict(mod, newdata = preddata, type = "link", se.fit = TRUE)
preds is then a list with components fit and se.fit.
The confidence interval on the linear predictor is then
critval <- 1.96 ## approx 95% CI
upr <- preds$fit + (critval * preds$se.fit)
lwr <- preds$fit - (critval * preds$se.fit)
fit <- preds$fit
critval is chosen from a t or z (normal) distribution as required (I forget exactly now which to use for which type of GLM and what the properties are) with the coverage required. The 1.96 is the value of the Gaussian distribution giving 95% coverage:
> qnorm(0.975) ## 0.975 as this is upper tail, 2.5% also in lower tail
[1] 1.959964
Now for fit, upr and lwr we need to apply the inverse of the link function to them.
fit2 <- mod$family$linkinv(fit)
upr2 <- mod$family$linkinv(upr)
lwr2 <- mod$family$linkinv(lwr)
Now you can plot all three and the data.
preddata$lwr <- lwr2
preddata$upr <- upr2
library(ggplot2)
ggplot(data = foo, mapping = aes(x = x, y = y)) + geom_point() +
  stat_smooth(method = "glm", method.args = list(family = binomial)) +
  geom_line(data = preddata, mapping = aes(x = x, y = upr), col = "red") +
  geom_line(data = preddata, mapping = aes(x = x, y = lwr), col = "red")
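Putting the steps above into one runnable chunk, using the mtcars-based example data mentioned in the comment (a sketch of the same approach, nothing beyond it):
library(ggplot2)
# Working example data, as in the comment above
foo <- mtcars[, c("mpg", "vs")]
names(foo) <- c("x", "y")
mod <- glm(y ~ x, data = foo, family = binomial)
preddata <- with(foo, data.frame(x = seq(min(x), max(x), length = 100)))
preds <- predict(mod, newdata = preddata, type = "link", se.fit = TRUE)
critval <- qnorm(0.975)   # ~1.96 for a 95% interval
preddata$fit <- mod$family$linkinv(preds$fit)
preddata$upr <- mod$family$linkinv(preds$fit + critval * preds$se.fit)
preddata$lwr <- mod$family$linkinv(preds$fit - critval * preds$se.fit)
ggplot(foo, aes(x, y)) +
  geom_point() +
  geom_ribbon(data = preddata, aes(x = x, ymin = lwr, ymax = upr),
              inherit.aes = FALSE, alpha = 0.2) +
  geom_line(data = preddata, aes(x = x, y = fit), inherit.aes = FALSE, col = "blue")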
I stumbled upon Liu WenSui's method, which uses a bootstrap or simulation approach to solve this problem for Poisson estimates.
Example from the author:
pkgs <- c('doParallel', 'foreach')
lapply(pkgs, require, character.only = TRUE)
registerDoParallel(cores = 4)
data(AutoCollision, package = "insuranceData")
df <- rbind(AutoCollision, AutoCollision)
mdl <- glm(Claim_Count ~ Age + Vehicle_Use, data = df, family = poisson(link = "log"))
new_fake <- df[1:5, 1:2]

boot_pi <- function(model, pdata, n, p) {
  odata <- model$data
  lp <- (1 - p) / 2
  up <- 1 - lp
  set.seed(2016)
  seeds <- round(runif(n, 1, 1000), 0)
  boot_y <- foreach(i = 1:n, .combine = rbind) %dopar% {
    set.seed(seeds[i])
    bdata <- odata[sample(seq(nrow(odata)), size = nrow(odata), replace = TRUE), ]
    bpred <- predict(update(model, data = bdata), type = "response", newdata = pdata)
    rpois(length(bpred), lambda = bpred)
  }
  boot_ci <- t(apply(boot_y, 2, quantile, c(lp, up)))
  return(data.frame(pred = predict(model, newdata = pdata, type = "response"),
                    lower = boot_ci[, 1], upper = boot_ci[, 2]))
}
boot_pi(mdl, new_fake, 1000, 0.95)
sim_pi <- function(model, pdata, n, p) {
  odata <- model$data
  yhat <- predict(model, type = "response")
  lp <- (1 - p) / 2
  up <- 1 - lp
  set.seed(2016)
  seeds <- round(runif(n, 1, 1000), 0)
  sim_y <- foreach(i = 1:n, .combine = rbind) %dopar% {
    set.seed(seeds[i])
    sim_y <- rpois(length(yhat), lambda = yhat)
    sdata <- data.frame(y = sim_y, odata[names(model$x)])
    refit <- glm(y ~ ., data = sdata, family = poisson)
    bpred <- predict(refit, type = "response", newdata = pdata)
    rpois(length(bpred), lambda = bpred)
  }
  sim_ci <- t(apply(sim_y, 2, quantile, c(lp, up)))
  return(data.frame(pred = predict(model, newdata = pdata, type = "response"),
                    lower = sim_ci[, 1], upper = sim_ci[, 2]))
}
sim_pi(mdl, new_fake, 1000, 0.95)
