Good morning. I need the community's help to understand some problems that occurred while writing this model.
I aim to model cause-of-death proportions using two predictors: "log_GDP" (gross domestic product on the log scale) and "log_h" (hospital beds per 1,000 people on the log scale).
y: 3 columns containing the observed proportions of deaths over the years.
x1: "log_GDP" (gross domestic product on the log scale)
x2: "log_h" (hospital beds per 1,000 people on the log scale)
As you can see from the estimation result in the last plot, I got a high noise level. When I used just one covariate, i.e. log_GDP, I obtained smooth results.
Here are the simulated data and the model specification:
library(reshape2)
library(tidyverse)
library(ggplot2)
library(runjags)
CIRC <- c(0.3685287, 0.3675516, 0.3567829, 0.3517274, 0.3448940, 0.3391031, 0.3320184, 0.3268640,
0.3227445, 0.3156360, 0.3138515,0.3084506, 0.3053657, 0.3061224, 0.3051044)
NEOP <- c(0.3602199, 0.3567355, 0.3599409, 0.3591258, 0.3544591, 0.3566269, 0.3510974, 0.3536156,
0.3532980, 0.3460948, 0.3476183, 0.3475634, 0.3426035, 0.3352433, 0.3266048)
OTHER <-c(0.2712514, 0.2757129, 0.2832762, 0.2891468, 0.3006468, 0.3042701, 0.3168842, 0.3195204,
0.3239575, 0.3382691, 0.3385302, 0.3439860, 0.3520308, 0.3586342, 0.3682908)
log_h <- c(1.280934, 1.249902, 1.244155, 1.220830, 1.202972, 1.181727, 1.163151, 1.156881, 1.144223,
1.141033, 1.124930, 1.115142, 1.088562, 1.075002, 1.061257)
log_GDP <- c(29.89597, 29.95853, 29.99016, 30.02312, 30.06973, 30.13358, 30.19878, 30.25675, 30.30184,
30.31974, 30.30164, 30.33854, 30.37460, 30.41585, 30.45150)
D <- data.frame(CIRC=CIRC, NEOP=NEOP, OTHER=OTHER,
log_h=log_h, log_GDP=log_GDP)
cause.y <- as.matrix((data.frame(D[,1],D[,2],D[,3])))
cause.y <- cause.y/rowSums(cause.y)
mat.x<- D$log_GDP
mat.x2 <- D$log_h
n <- 15
JAGS model
dirlichet.model = "
model {
#setup priors for each species
for(j in 1:N.spp){
m0[j] ~ dnorm(0, 1.0E-3) #intercept prior
m1[j] ~ dnorm(0, 1.0E-3) # mat.x prior
m2[j] ~ dnorm(0, 1.0E-3) # mat.x2 prior
}
#implement the Dirichlet likelihood
for(i in 1:N){
y[i,1:N.spp] ~ ddirch(a0[i,1:N.spp])
for(j in 1:N.spp){
log(a0[i,j]) <- m0[j] + m1[j] * mat.x[i]+ m2[j] * mat.x2[i] # m0 = intercept; m1= coeff log_GDP; m2= coeff log_h
}
}} #close model loop.
"
jags.data <- list(y = cause.y, mat.x = mat.x, mat.x2 = mat.x2, N = nrow(cause.y), N.spp = ncol(cause.y))
jags.out <- run.jags(dirlichet.model,
data=jags.data,
adapt = 5000,
burnin = 5000,
sample = 10000,
n.chains=3,
monitor=c('m0','m1','m2'))
out <- summary(jags.out)
head(out)
Gather the coefficients and compute the estimated proportions
coeff <- out[1:9, 4]       # column 4 of the runjags summary holds the posterior means
coef1 <- out[c(1,4,7), 4]  # intercept and the two slopes for cause 1 (CIRC)
coef2 <- out[c(2,5,8), 4]  # intercept and the two slopes for cause 2 (NEOP)
coef3 <- out[c(3,6,9), 4]  # intercept and the two slopes for cause 3 (OTHER)
pred <- as.matrix(cbind(exp(coef1[1]+coef1[2]*mat.x+coef1[3]*mat.x2),
exp(coef2[1]+coef2[2]*mat.x+coef2[3]*mat.x2),
exp(coef3[1]+coef3[2]*mat.x+coef3[3]*mat.x2)))
pred <- pred / rowSums(pred)
Combine the predicted and observed values into one data frame
Obs <- data.frame(Circ=cause.y[,1],
Neop=cause.y[,2],
Other=cause.y[,3],
log_GDP=mat.x,
log_h=mat.x2)
Obs$model <- "Obs"
Pred <- data.frame(Circ=pred[,1],
Neop=pred[,2],
Other=pred[,3],
log_GDP=mat.x,
log_h=mat.x2)
Pred$model <- "Pred"
tot60<-as.data.frame(rbind(Obs,Pred))
tot <- melt(tot60,id=c("log_GDP","log_h","model"))
tot$variable <- as.factor(tot$variable)
Plot
tot %>% filter(model == "Obs") %>%
  ggplot(aes(log_GDP, value)) +
  geom_point() +
  geom_line(data = tot %>% filter(model == "Pred")) +
  facet_wrap(. ~ variable, scales = "free")
The reason for the non-smoothness is that you are calculating Pr(y = m | X) = f(x1, x2), that is, the predicted probability is a function of both x1 and x2, and you are then plotting Pr(y = m | X) as a function of a single x variable, the log of GDP. That result will almost certainly not be smooth. The log_GDP and log_h variables are highly negatively correlated, which is why the result is not much more variable than it is.
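As a quick check of that claim, you can compute the correlation between the two predictors in the posted data directly (a minimal snippet, not part of the original post):
cor(D$log_GDP, D$log_h)  # strongly negative for the data above
cor(mat.x, mat.x2)       # same check on the vectors passed to JAGS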
In my run of the model, the average coefficient for log_GDP is actually positive for NEOP and OTHER, suggesting that the result you see in the plot is quite misleading. If you were to plot these in two dimensions, you would see that the result is, again, smooth.
mx1 <- seq(min(mat.x), max(mat.x), length=25)
mx2 <- seq(min(mat.x2), max(mat.x2), length=25)
eg <- expand.grid(mx1 = mx1, mx2 = mx2)
pred <- as.matrix(cbind(exp(coef1[1]+coef1[2]*eg$mx1 + coef1[3]*eg$mx2),
exp(coef2[1]+coef2[2]*eg$mx1 + coef2[3]*eg$mx2),
exp(coef3[1]+coef3[2]*eg$mx1 + coef3[3]*eg$mx2)))
pred <- pred / rowSums(pred)
Pred <- data.frame(Circ=pred[,1],
Neop=pred[,2],
Other=pred[,3],
log_GDP=eg$mx1,   # use the grid columns so each row of Pred matches a grid point
log_h=eg$mx2)
lattice::wireframe(Neop ~ log_GDP + log_h, data=Pred, drape=TRUE)
A couple of other things to watch out for.
Usually in hierarchical Bayesian models, the parameters of your coefficients would themselves be given distributions with hyperparameters. This enables shrinkage of the coefficients toward the global mean, which is a hallmark of hierarchical models.
I am not sure whether this is what your data really look like, but the correlation between the two independent variables is going to make it difficult for the model to converge. You could try using a multivariate normal distribution for the coefficients; that might help.
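To make that suggestion concrete, here is a minimal sketch (not the original model) of hierarchical priors with a multivariate normal distribution on each cause's coefficient vector; the hyperparameter structure, the Wishart prior, and the extra data element R are illustrative assumptions:
dirlichet.model.hier = "
model {
  # hyperpriors: global mean coefficient vector and precision matrix (assumed structure)
  for (k in 1:3) {
    mu.b[k] ~ dnorm(0, 1.0E-3)
  }
  Omega[1:3,1:3] ~ dwish(R[1:3,1:3], 4)   # R supplied in the data list, e.g. R = diag(3)
  # cause-level coefficients drawn from a common multivariate normal -> shrinkage toward mu.b
  for (j in 1:N.spp) {
    b[j,1:3] ~ dmnorm(mu.b[1:3], Omega[1:3,1:3])
  }
  for (i in 1:N) {
    y[i,1:N.spp] ~ ddirch(a0[i,1:N.spp])
    for (j in 1:N.spp) {
      log(a0[i,j]) <- b[j,1] + b[j,2] * mat.x[i] + b[j,3] * mat.x2[i]
    }
  }
}"
# the data list would then gain one element, e.g. jags.data$R <- diag(3),
# and monitor would become c('mu.b','b')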
I need to show that the number of events in a Poisson process is distributed according to a Poisson distribution with parameter lambda * t.
Here is the Poisson process generator:
ppGen <- function(lambda, maxTime){
  taos <- taosGen(lambda, maxTime)   # arrival times of the process (taosGen not shown here)
  pp <- NULL
  for(i in 1:maxTime){
    pp[i] <- sum(taos <= i)          # N(i): number of arrivals up to time i
  }
  return(pp)
}
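taosGen() is not shown in the question. For a self-contained example, a minimal stand-in under the usual assumption (arrival times are cumulative sums of Exp(lambda) inter-arrival times, truncated at maxTime) might look like this:
# hypothetical helper, not from the question
taosGen <- function(lambda, maxTime) {
  taos <- numeric(0)
  t <- 0
  repeat {
    t <- t + rexp(1, rate = lambda)   # exponential inter-arrival time
    if (t > maxTime) break
    taos <- c(taos, t)
  }
  taos
}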
Here I try to replicate the process 1000 times and collect the total number of occurrences in each realisation into a vector:
d <- ppGen(0.5,100)
tail(d,n=1)
reps <- 1000
x1 <- replicate(reps, tail(ppGen(0.5,100), n=1))
hist(x1)
Here is the resulting histogram (figure omitted):
Here I am trying to draw a theoretical Poisson density curve with parameter lambda * t:
xfit<-seq(1,100,length=100)
yfit<-dpois(xfit,lambda = 0.5*100)
lines(xfit,yfit)
But the curve doesn't appear anywhere near the histogram. Can anyone suggest the right way to do this?
Maybe you can try curve() as below. The main issue with the original attempt is that hist() was drawn on the count (frequency) scale while dpois() returns probabilities, so the theoretical curve was flattened against the x-axis; plotting the histogram with freq = FALSE puts both on the density scale.
x <- rpois(1000, 0.5 * 100)
dp <- function(x, lbd = 0.5 * 100) dpois(x, lambda = lbd)
curve(dp, 0, 100)                 # with from = 0, to = 100 and curve()'s default n = 101,
                                  # dp is evaluated exactly at the integers 0:100
hist(x, freq = FALSE, add = TRUE) # freq = FALSE puts the histogram on the density scale
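Since dpois() is a probability mass function on the integers, another option (a small sketch, not the only way) is to overlay the pmf only at its integer support and align the histogram bins with the integers:
x <- rpois(1000, 0.5 * 100)
# half-integer breaks so each bar covers exactly one integer count
hist(x, freq = FALSE, breaks = seq(min(x) - 0.5, max(x) + 0.5, by = 1))
points(min(x):max(x), dpois(min(x):max(x), lambda = 0.5 * 100), col = 2, pch = 16)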
I am trying to fit an exponentially modified Gaussian (as in equation (1) at https://en.wikipedia.org/wiki/Exponentially_modified_Gaussian_distribution) to my 2D (x, y) data in R.
My data are:
x <- c(1.13669371604919, 1.14107275009155, 1.14545404911041, 1.14983117580414,
1.15421032905579, 1.15859162807465, 1.16296875476837, 1.16734790802002,
1.17172694206238, 1.17610621452332, 1.18048334121704, 1.18486452102661,
1.18924164772034, 1.19362080097198, 1.19800209999084, 1.20237922668457,
1.20675826072693, 1.21113955974579, 1.21551668643951, 1.21989583969116,
1.22427713871002, 1.22865414619446, 1.2330334186554, 1.23741245269775,
1.24178957939148, 1.24616885185242, 1.25055003166199, 1.25492715835571,
1.25930631160736, 1.26368761062622, 1.26806473731995, 1.2724437713623
)
y <- c(42384.03125, 65262.62890625, 235535.828125, 758616, 1691651.75,
3956937.25, 8939261, 20311304, 41061724, 65143896, 72517440,
96397368, 93956264, 87773568, 82922064, 67289832, 52540768, 50410896,
35995212, 27459486, 14173627, 12645145, 10069048, 4290783.5,
2999174.5, 2759047.5, 1610762.625, 1514802, 958150.6875, 593638.6875,
368925.8125, 172826.921875)
The function I am trying to fit and the value I am trying to minimize for optimization:
EMGCurve <- function(x, par)
{
ta <- 1/par[1]
mu <- par[2]
si <- par[3]
h <- par[4]
Fct.V <- (h * si / ta) * (pi/2)^0.5 * exp(0.5 * (si / ta)^2 - (x - mu)/ta)
Fct.V
}
RMSE <- function(par)
{
Fct.V <- EMGCurve(x, par)
sqrt(sum((y - Fct.V)^2)/length(y))   # the original used 'signal', which is presumably the observed y
}
result <- optim(par = c(1, x[which.max(y)], unname(quantile(x)[4] - quantile(x)[2]), max(y)),
                fn = RMSE, method = "L-BFGS-B",
                lower = c(1, min(x), 0.0001, 0.1 * max(y)),
                upper = c(Inf, max(x), 0.5 * (max(x) - min(x)), max(y)),
                control = list(factr = 1e7))
However, when I try to visualize the result at the end, it seems like nothing useful is happening:
plot(x, y, xlab="RT/min", ylab="I")
lines(seq(min(x), max(x), length=1000), EMGCurve(seq(min(x), max(x), length=1000), result$par), col=2)  # the original called GaussCurve(), presumably a leftover name
For some reason it doesn't work at all, although I managed to do this for a normal distribution with similar code. It would be great if someone had an idea.
If it might be of some use, I got an OK fit to your data using an X-shifted log-normal type peak equation, "y = a * exp(-0.5 * pow((log(x-d)-b) / c, 2.0))", with parameters a = 9.4159743234392539E+07, b = -2.7516932481669185E+00, c = -2.4343893243720971E-01, and d = 1.1251623071481867E+00, yielding R-squared = 0.994 and RMSE = 2.49E+06. I personally was unable to fit the data using the equation in your post. There may be value in scaling the dependent data, as the values are large, but this equation seems to fit the data as is.
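For reference, a quick way to overlay that reported equation on the data in R (a sketch; the function name f_lnpeak is mine, and the parameter values are copied from the fit reported above):
f_lnpeak <- function(x, a, b, c, d) a * exp(-0.5 * ((log(x - d) - b) / c)^2)
plot(x, y, xlab = "RT/min", ylab = "I")
xx <- seq(min(x), max(x), length.out = 1000)
lines(xx, f_lnpeak(xx,
                   a = 9.4159743234392539e+07,
                   b = -2.7516932481669185e+00,
                   c = -2.4343893243720971e-01,
                   d = 1.1251623071481867e+00), col = 2)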
I am trying to solve for the parameters of a gamma distribution that is convolved with both normal and lognormal distributions. I can experimentally derive parameters for both the normal and lognormal components, so I just want to solve for the gamma parameters.
I have attempted three approaches to this problem:
1) generating convolved random datasets (i.e. rnorm() + rlnorm() + rgamma()) and using least-squares regression on the linear- or log-binned histograms of the data (not shown; it was very biased by the RNG and did not optimize well at all);
2) "brute-force" numerical integration of the convolution integrals (example code #1);
3) numerical integration approaches with the distr package (example code #2).
I have had limited success with all three approaches. Importantly, they seem to work well for "nominal" values of the gamma parameters, but they all begin to fail when k (shape) is low and theta (scale) is high, which is where my experimental data reside. Please find the examples below.
Straight-up numerical Integration
# make the functions
f.N <- function(n) dnorm(n, N[1], N[2])
f.L <- function(l) dlnorm(l, L[1], L[2])
f.G <- function(g) dgamma(g, G[1], scale=G[2])
# make convolved functions
f.Z <- function(z) integrate(function(x,z) f.L(z-x)*f.N(x), -Inf, Inf, z)$value # L+N
f.Z <- Vectorize(f.Z)
f.Z1 <- function(z) integrate(function(x,z) f.G(z-x)*f.Z(x), -Inf, Inf, z)$value # G+(L+N)
f.Z1 <- Vectorize(f.Z1)
# params of Norm, Lnorm, and Gamma
N <- c(0,5)
L <- c(2.5,.5)
G <- c(2,7) # this distribution is the one we ultimately want to solve for.
# G <- c(.5,10) # 0<k<1
# G <- c(.25,5e4) # ballpark params of experimental data
# generate some data
set.seed(1)
rN <- rnorm(1e4, N[1], N[2])
rL <- rlnorm(1e4, L[1], L[2])
rG <- rgamma(1e4, G[1], scale=G[2])
Z <- rN + rL
Z1 <- rN + rL + rG
# check the fit
hist(Z,freq=F,breaks=100, xlim=c(-10,50), col=rgb(0,0,1,.25))
hist(Z1,freq=F,breaks=100, xlim=c(-10,50), col=rgb(1,0,0,.25), add=T)
z <- seq(-10,50,1)
lines(z,f.Z(z),lty=2,col="blue", lwd=2) # looks great... convolution performs as expected.
lines(z,f.Z1(z),lty=2,col="red", lwd=2) # this works perfectly so long as k(shape)>=1
# I'm guessing the failure to compute when shape 0 < k < 1 is due to
# numerical integration problems, but I don't know how to fix it.
integrate(dgamma, -Inf, Inf, shape=1, scale=1) # ==1
integrate(dgamma, 0, Inf, shape=1, scale=1) # ==1
integrate(dgamma, -Inf, Inf, shape=.5, scale=1) # !=1
integrate(dgamma, 0, Inf, shape=.5, scale=1) # != 1
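# A hedged side note (not from the original post): dgamma() is unbounded at 0 when shape < 1,
# and integrate() tends to cope better when that singularity sits at the finite endpoint of a
# finite interval, so splitting the range may stabilize the check above:
integrate(dgamma, 0, 1, shape=.5, scale=1)$value +
  integrate(dgamma, 1, Inf, shape=.5, scale=1)$value   # should come out much closer to 1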
# Let's try to estimate gamma anyway, supposing k>=1
optimFUN <- function(par, N, L) {
  print(par)
  # rebind the global f.G to the candidate gamma parameters; f.Z1 looks f.G up in the global environment
  f.G <<- function(g) dgamma(g, par[1], scale=par[2])
  -sum(log(f.Z1(Z1[1:4e2])))   # negative log-likelihood on the first 400 points
}
fitresult <- optim(c(1.6,5), optimFUN, N=N, L=L)
par <- fitresult$par
f.G <- function(g) dgamma(g, par[1], scale=par[2])   # plug in the fitted parameters before plotting
lines(z,f.Z1(z),lty=2,col="green3", lwd=2) # not so great... likely better w/ more data,
# but it is SUPER slow and I observe large step sizes.
Attempting convolution via the distr package
library(distr)   # provides Norm, Lnorm, Gammad, convpow, and the d()/r() accessors
# params of Norm, Lnorm, and Gamma
N <- c(0,5)
L <- c(2.5,.5)
G <- c(2,7) # this distribution is the one we ultimately want to solve for.
# G <- c(.5,10) # 0<k<1
# G <- c(.25,5e4) # ballpark params of experimental data
# make the distributions and "convolvings'
dN <- Norm(N[1], N[2])
dL <- Lnorm(L[1], L[2])
dG <- Gammad(G[1], G[2])
d.NL <- d(convpow(dN+dL,1))
d.NLG <- d(convpow(dN+dL+dG,1)) # for large values of theta, no matter how I change
# getdistrOption("DefaultNrFFTGridPointsExponent"), grid size is always wrong.
# Generate some data
set.seed(1)
rN <- r(dN)(1e4)
rL <- r(dL)(1e4)
rG <- r(dG)(1e4)
r.NL <- rN + rL
r.NLG <- rN + rL + rG
# check the fit
hist(r.NL, freq=F, breaks=100, xlim=c(-10,50), col=rgb(0,0,1,.25))
hist(r.NLG, freq=F, breaks=100, xlim=c(-10,50), col=rgb(1,0,0,.25), add=T)
z <- seq(-10,50,1)
lines(z,d.NL(z), lty=2, col="blue", lwd=2) # looks great... convolution performs as expected.
lines(z,d.NLG(z), lty=2, col="red", lwd=2) # this appears to work perfectly
# for most values of K and low values of theta
# this is looking a lot more promising... how about estimating gamma params?
optimFUN <- function(par, dN, dL) {
tG <- Gammad(par[1],par[2])
d.NLG <- d(convpow(dN+dL+tG,1))
p <- d.NLG(r.NLG)
p[p==0] <- 1e-15 # because sometimes very low probabilities evaluate to 0...
# ...and logs don't like that.
-sum(log(p))
}
fitresult <- optim(c(1,1e4), optimFUN, dN=dN, dL=dL)
fdG <- Gammad(fitresult$par[1], fitresult$par[2])
fd.NLG <- d(convpow(dN+dL+fdG,1))
lines(z,fd.NLG(z), lty=2, col="green3", lwd=2) ## this works perfectly when ~k>1 & ~theta<100... but throws
## "Error in validityMethod(object) : shape has to be positive" when k decreases and/or theta increases
## (boundary subject to RNG).
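One hedged idea for the "shape has to be positive" failures (a sketch, not from the original post): optimize on the log scale so that the optimizer can never hand Gammad() a negative shape or scale.
# log-parameterized version of the objective above; everything else is unchanged
optimFUN.log <- function(lpar, dN, dL) {
  par <- exp(lpar)                       # back-transform to strictly positive shape/scale
  tG <- Gammad(par[1], par[2])
  d.NLG <- d(convpow(dN + dL + tG, 1))
  p <- d.NLG(r.NLG)
  p[p == 0] <- 1e-15
  -sum(log(p))
}
fitresult <- optim(log(c(1, 1e4)), optimFUN.log, dN = dN, dL = dL)
exp(fitresult$par)                       # fitted shape and scale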
Can I speed up the integration in example 1? Can I increase the grid size in example 2 (the distr package)? How can I address the k < 1 problem? Can I rescale the data in a way that will better facilitate evaluation at high theta values?
Is there a better way altogether?
Help!
Well, convolution of a function with a Gaussian kernel calls for Gauss–Hermite quadrature. In R it is implemented in a dedicated package: https://cran.r-project.org/web/packages/gaussquad/gaussquad.pdf
UPDATE
For convolution with a Gamma distribution this package might be useful as well, via Gauss–Laguerre quadrature.
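A minimal sanity check of the Laguerre rules, assuming gaussquad's Laguerre interface mirrors the Hermite one used below (laguerre.quadrature.rules() / laguerre.quadrature(); please verify the argument names against the package manual):
library(gaussquad)
n.quad <- 60   # a modest order is enough for this check
lrule <- laguerre.quadrature.rules(n.quad)[[n.quad]]
# the Laguerre weight is exp(-x) on [0, Inf); integrating f(x) = 1 against it should give 1
q <- laguerre.quadrature(function(x) rep(1.0, length(x)), lrule)
print(q - 1.0)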
UPDATE II
Here is quick code to convolve a Gaussian with a lognormal; hopefully it does not have too many bugs, and it prints a reasonable-looking graph.
library(gaussquad)
n.quad <- 170 # integration order
# get the particular weights/abscissas as data frame with 2 observables and n.quad observations
rule <- ghermite.h.quadrature.rules(n.quad, mu = 0.0)[[n.quad]]
# test function - integrate f(x) = 1 against the weight exp(-x^2) from -Inf to Inf;
# we should get sqrt(pi) as the answer
f <- function(x) {
1.0
}
q <- ghermite.h.quadrature(f, rule)
print(q - sqrt(pi))
# convolution of lognormal with gaussian
# because of the G-H rules, we have to make our own function
# for simplicity, sigmas are one and mus are zero
sqrt2 <- sqrt(2.0)
c.LG <- function(z) {
#print(z)
f.LG <- function(x) {
t <- (z - x*sqrt2)
q <- 0.0
if (t > 0.0) {
l <- log(t)
q <- exp( - 0.5*l*l ) / t
}
q
}
ghermite.h.quadrature(Vectorize(f.LG), rule) / (pi*sqrt2)
}
library(ggplot2)
p <- ggplot(data = data.frame(x = 0), mapping = aes(x = x))
p <- p + stat_function(fun = Vectorize(c.LG))
p <- p + xlim(-1.0, 5.0)
print(p)
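As a cross-check (a sketch, not part of the answer above), the quadrature result can be compared with the question's brute-force convolution, using zero means and unit sigmas to match the assumptions hard-coded in c.LG:
# redefine the brute-force densities with mu = 0, sigma = 1 for both components
f.N <- function(n) dnorm(n, 0, 1)
f.L <- function(l) dlnorm(l, 0, 1)
f.Z <- Vectorize(function(z) integrate(function(x, z) f.L(z - x) * f.N(x), -Inf, Inf, z)$value)
zz <- seq(-1, 5, by = 0.5)
cbind(quadrature = sapply(zz, c.LG), brute_force = f.Z(zz))   # the two columns should agree closely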
I am trying to modify Kyle Gorman's autoloess function to be callable as a method in ggplot2's stat_smooth. autoloess is a simple wrapper which runs loess through an optimiser to find the value of span which minimises AICc.
I have created something which runs successfully, but only by using a global variable. Is there a more elegant, idiomatic way of programming this?
My code:
AICc.loess <- function(fit) {
# compute AIC_C for a LOESS fit, from:
#
# Hurvich, C.M., Simonoff, J.S., and Tsai, C. L. 1998. Smoothing
# parameter selection in nonparametric regression using an improved
# Akaike Information Criterion. Journal of the Royal Statistical
# Society B 60: 271–293.
#
# #param fit loess fit
# #return 'aicc' value
stopifnot(inherits(fit, 'loess'))
# parameters
n <- fit$n
trace <- fit$trace.hat
sigma2 <- sum(resid(fit) ^ 2) / (n - 1)
return(log(sigma2) + 1 + 2 * (2 * (trace + 1)) / (n - trace - 2))
}
.autoloess.magic.w <- NULL
autoloess <- function(formula, data, weights, span=c(0.01, 2.0)) {
.autoloess.magic.w <- ~weights
fit <- loess(formula=formula,
data=data,
weights=.autoloess.magic.w)
stopifnot(length(span) == 2)
# loss function in form to be used by optimize
f <- function(span) AICc.loess(update(fit, span=span))
# find best loess according to loss function
res <- update(fit, span=optimize(f, span)$minimum)
cat(paste("Optimal span:", res$pars$span, "\n"))
return(res)
}
And a quick test:
# Test
library(ggplot2)
set.seed(1984)
# Create a cubic curve
df <- data.frame(x=1:2500, y=500000 +
(-1000*(1:2500)) +
((1:2500)^2) +
-0.00025*((1:2500)^3) +
rnorm(2500, sd=60000),
ww=runif(2500, min=0, max=10))
# Use loess span
ggplot(df, aes(x=x, y=y, weight=ww)) + geom_point() + stat_smooth(method="loess")
# Use autoloess
ggplot(df, aes(x=x, y=y, weight=ww)) + geom_point() + stat_smooth(method="autoloess")
You can use the weight variable (it seems to be available when the function is called):
autoloess <- function(formula, data, weights, span=c(0.01, 2.0)) {
fit <- loess(formula = formula,
data = data,
weights=weight)
stopifnot(length(span) == 2)
# loss function in form to be used by optimize
f <- function(span) AICc.loess(update(fit, span=span))
# find best loess according to loss function
res <- update(fit, span=optimize(f, span)$minimum)
cat(paste("Optimal span:", res$pars$span, "\n"))
return(res)
}
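With that change, the original test call from the question should work as before, for example:
# same call as in the question's test block
ggplot(df, aes(x = x, y = y, weight = ww)) + geom_point() + stat_smooth(method = "autoloess")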