Finding the maximum difference between power functions in R

How do I estimate the maximum difference between the two power functions below (the power of the CLT-based test and the power of the binomial test, with n = 40)?
power.z = function(mu, n) {
  power = 1 - pnorm(1.645 - (mu - 75)/(2.5/sqrt(n)), 0, 1)
  return(power)
}
power.bin = function(mu, n) {
  p = 1 - pnorm(75, mu, 2.5)
  power = 1 - pnorm(1.645*sqrt(.25/(p*(1 - p))) - (p - .5)/sqrt(p*(1 - p)/n), 0, 1)
  return(power)
}
power.z(75.8, 40) ## 0.6476032
power.bin(75.8, 40) ## 0.4763833

Here is a solution based on @Khashaa's suggestion. With power.z and power.bin defined as above, wrap the absolute difference in a function and maximize it with optimize():
fnDiffZBin = function(mu, n) abs(power.z(mu, n) - power.bin(mu, n))
optimize(fnDiffZBin, interval = c(75, 80), n = 40, maximum = TRUE)
which gives:
> optimize(fnDiffZBin, interval = c(75, 80), n = 40, maximum = TRUE)
$maximum
[1] 75.68248
$objective
[1] 0.1755956
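As a quick cross-check (an addition, not part of the original answer), a fine grid search over the same interval should land on the same point, since both power functions vectorize over mu:

## Grid search over [75, 80]
mu.grid <- seq(75, 80, by = 0.001)
diffs <- fnDiffZBin(mu.grid, n = 40)
mu.grid[which.max(diffs)]   # ~75.682, agreeing with optimize()
max(diffs)                  # ~0.1756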

Related

MCMC for estimating negative binomial distribution

I want to estimate the parameters of a negative binomial distribution using the Metropolis-Hastings MCMC algorithm. In other words, I have a sample:
set.seed(42)
y <- rnbinom(20, size = 3, prob = 0.2)
and I want to write an algorithm that estimates the size and prob parameters.
My work so far
I defined the prior distribution of size as Poisson:
prior_r <- function(r) {
  return(dpois(r, lambda = 2, log = TRUE))
}
And the prior distribution of prob as uniform on [0, 1]:
prior_prob <- function(prob) {
  return(dunif(prob, min = 0, max = 1, log = TRUE))
}
Moreover, for simplicity, I defined log-likelihood and joint log-density functions:
loglikelihood <- function(data, r, prob) {
  loglikelihoodValue <- sum(dnorm(data, mean = r, sd = prob, log = TRUE))
  return(loglikelihoodValue)
}
joint <- function(r, prob) {
  data <- y
  return(loglikelihood(data, r, prob) + prior_r(r) + prior_prob(prob))
}
Finally, the whole algorithm:
run_mcmc <- function(startvalue, iterations) {
  chain <- array(dim = c(iterations + 1, 2))
  chain[1, ] <- startvalue
  for (i in 1:iterations) {
    proposal_r <- rpois(1, lambda = chain[i, 1])
    proposal_prob <- chain[i, 2] + runif(1, min = -0.2, max = 0.2)
    quotient <- joint(proposal_r, proposal_prob) - joint(chain[i, 1], chain[i, 2])
    if (runif(1, 0, 1) < min(1, exp(quotient))) {
      chain[i + 1, ] <- c(proposal_r, proposal_prob)
    } else {
      chain[i + 1, ] <- chain[i, ]
    }
  }
  return(chain)
}
The problem
The problem I'm having is that even when I run it with starting values very close to the correct ones:
iterations <- 2000
startvalue <- c(4, 0.25)
res <- run_mcmc(startvalue, iterations)
I obtain a posterior distribution that is obviously wrong. For example:
> colMeans(res)
[1] 11.963018 0.994533
As you can see, the size estimate sits very close to 12, and the probability estimate sits at 1.
Do you know what causes this?
Change dnorm in loglikelihood to dnbinom, and fix the proposal for prob so it doesn't step outside (0, 1). (With dnorm, r is fitted as a mean, so it drifts to the sample mean of about 12, and prob plays the role of a standard deviation, so the chain pushes it against the prior's upper boundary at 1; that explains the output above.)
set.seed(42)
y <- rnbinom(20, size = 3, prob = 0.2)
prior_r <- function(r) {
  return(dpois(r, lambda = 2, log = TRUE))
}
prior_prob <- function(prob) {
  return(dunif(prob, min = 0, max = 1, log = TRUE))
}
loglikelihood <- function(data, r, prob) {
  loglikelihoodValue <- sum(dnbinom(data, size = r, prob = prob, log = TRUE))
  return(loglikelihoodValue)
}
joint <- function(r, prob) {
  return(loglikelihood(y, r, prob) + prior_r(r) + prior_prob(prob))
}
run_mcmc <- function(startvalue, iterations) {
  chain <- array(dim = c(iterations + 1, 2))
  chain[1, ] <- startvalue
  for (i in 1:iterations) {
    proposal_r <- rpois(1, lambda = chain[i, 1])
    # shift the range so the proposal stays inside (0, 1)
    proposal_prob <- chain[i, 2] + runif(1, min = max(-0.2, -chain[i, 2]),
                                         max = min(0.2, 1 - chain[i, 2]))
    quotient <- joint(proposal_r, proposal_prob) - joint(chain[i, 1], chain[i, 2])
    if (runif(1, 0, 1) < min(1, exp(quotient))) {
      chain[i + 1, ] <- c(proposal_r, proposal_prob)
    } else {
      chain[i + 1, ] <- chain[i, ]
    }
  }
  return(chain)
}
iterations <- 2000
startvalue <- c(4, 0.25)
res <- run_mcmc(startvalue, iterations)
colMeans(res)
#> [1] 3.1009495 0.1988177
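Two small notes (additions, not part of the original answer): the Poisson proposal for r is not symmetric, so a textbook Metropolis-Hastings ratio would also include the proposal densities (a dpois term in each direction); the simplified ratio above targets the posterior only approximately, though the estimates come out close here. It is also worth eyeballing mixing with trace plots and discarding an initial burn-in before summarizing:

## Trace plots of the two chains
par(mfrow = c(2, 1))
plot(res[, 1], type = "l", xlab = "iteration", ylab = "size (r)")
plot(res[, 2], type = "l", xlab = "iteration", ylab = "prob")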

R code for simulating stochastic asset price path

Consider the following model for the evolution of an asset's price: x_t = x_{t-1} * exp(mu + sigma * z_t), where z_t is +1 with probability p and -1 with probability 1 - p.
This is what I have done (in R). I could not find a function that randomly outputs +1 or -1, so I decided to adapt the built-in rbinom function.
## This code is in R
library(dplyr)  # for if_else()

path = function(T, mu, sigma, p, x0) {
  x = rep(NA, T)
  x[1] = x0
  for (i in 2:T) {
    z = if_else(rbinom(1, 1, p) == 0, -1, 1)  # +1 with probability p, -1 otherwise
    x[i] = x[i-1] * exp(mu + sigma*z)
  }
  return(x)
}
## Just some testing
x_sim = path(T = 4, mu = 0, sigma = 0.01, p = 0.5, x0 = 100)
## Actual answer
Np = 10000
mc = matrix(nrow = 17, ncol = Np)
for (j in 1:Np) {
  mc[, j] = path(T = 17, mu = 0, sigma = 0.01, p = 0.5, x0 = 100)
}
test = mc[2:nrow(mc), ] >= 100
sum_test = colSums(test)
comp = sum(sum_test >= 1)/length(sum_test)
prob = 1 - comp
Does this make sense? Any help/tips/advice would be much appreciated. Thanks!
Staying close to your code, I came up with the following. Intuitively, the probability should be rather low given the parameters; I get about 6.7%, which is roughly what your code also produces with the parameters from the assignment.
simpath <- function(t, mu, sigma, p, x0, seed) {
  # set the seed if one is supplied
  if (!missing(seed)) {
    set.seed(seed)
  }
  # set up a matrix for storing the results
  res <- matrix(c(1:t, rep(NA, t*2)), ncol = 3)
  colnames(res) <- c('t', 'z_t', 'x_t')
  res[, 'z_t'] <- sample(c(1, -1), size = t, prob = c(p, 1-p), replace = TRUE)
  res[1, 3] <- x0
  for (i in 2:t) {
    res[i, 3] <- res[i-1, 3] * exp(mu + sigma*res[i, 2])
  }
  return(res)
}
x_sim <- simpath(t = 4, mu = 0, sigma = 0.01, p = 0.5, x0 = 100, seed = 123)
x_sim2 <- simpath(t = 36, mu = 0, sigma = 0.03, p = 0.5, x0 = 100, seed = 123)
## Actual answer
Np <- 100000
mc <- matrix(nrow = 36, ncol = Np)
for (j in 1:Np) {
  mc[, j] <- simpath(t = 36, mu = 0, sigma = 0.03, p = 0.5, x0 = 100)[, 3]
}
test <- mc[2:nrow(mc), ] >= 100   # drop the starting row; >= 100 matches the question's criterion
sum_test <- colSums(test)
comp <- sum(sum_test == 0)/length(sum_test)   # fraction of paths that never reach 100 again
prob <- comp
> prob
[1] 0.06759
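As a sanity check (an addition, not part of the original answer): with mu = 0 the log-price is a fair +/-1 random walk, and a path that never gets back to 100 or above is one whose walk stays strictly negative for all 35 steps after the start. Strict negativity forces the first step down, and the classical reflection result P(S_k <= 0 for all k <= 2m) = choose(2m, m)/4^m then covers the remaining 34 steps, giving a closed form:

## first step down (prob 1/2), then the 34 remaining steps must stay <= 0
0.5 * choose(34, 17) / 2^34   # ~0.0679, agreeing with the 0.06759 above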

Data generation for the functional regression problem

I have a problem with data generation and no idea how to solve it. All the information is provided in the photo: Problem.
I think that X_i(t) in both cases should be 200 x 100 if we take t from 0 to 1 (length = 100). Furthermore, the coefficient matrix for the polynomial basis should be 200 x 4 and the one for the Fourier basis 200 x 5. But I have no idea how to start.
Here is some code. I have already defined my betas, but I can't manage to generate X_i(t).
t <- seq(0, 1, length = 100)
beta_1t <- rep(0, 100)
plot(t, beta_1t, type = "l")
beta_2t <- (t >= 0 & t < 0.342) * ((t - 0.5)^2 - 0.025) +
(t >= 0.342 & t <= 0.658) * 0 +
(t > 0.658 & t <= 1) * (-(t - 0.5)^2 + 0.025)
plot(t, beta_2t, type = "l")
beta_3t <- t^3 - 1.6 * t^2 + 0.76 * t + 1
plot(t, beta_3t, type = "l")
poly_c <- matrix(rnorm(n = 800, mean = 0, sd = 1), ncol = 4)    # 200 x 4 polynomial coefficients
four_c <- matrix(rnorm(n = 1000, mean = 0, sd = 1), ncol = 5)   # 200 x 5 Fourier coefficients
As I mentioned before, there should be (X_i(t), Y_i(t)) samples, with i = 1, 2, ..., 200 and t in [0, 1] (length = 100).
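The question is unanswered here, but one natural reading is X_i(t) = sum_k c_ik * phi_k(t), i.e. each curve is a random linear combination of basis functions. A minimal sketch under that assumption follows; the monomial basis {1, t, t^2, t^3} and the Fourier basis {1, sin(2*pi*t), cos(2*pi*t), sin(4*pi*t), cos(4*pi*t)} are guesses, since the actual bases are in the photo:

## basis matrices evaluated on the grid t (100 points)
poly_basis <- cbind(1, t, t^2, t^3)                       # 100 x 4
four_basis <- cbind(1, sin(2*pi*t), cos(2*pi*t),
                    sin(4*pi*t), cos(4*pi*t))             # 100 x 5
## each row of a coefficient matrix gives one curve X_i(t)
X_poly <- poly_c %*% t(poly_basis)                        # 200 x 100
X_four <- four_c %*% t(four_basis)                        # 200 x 100
matplot(t, t(X_poly[1:5, ]), type = "l")                  # a few sample curves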

My p-values are way lower than I expected and I cannot build a proper power curve

pval.dist.sim = function(n, sigma_x, rho, reps = 2500) {
  p = 5; sigma = sqrt(2)
  beta = c(0.5, 0.5, 0, 0.25, 0)
  mu = 10
  # generate vector for pvals
  pval.list = numeric(reps)
  for (r in 1:reps) {
    # generate design matrix
    X = gen_X(n = n, p = 5, rho = rho, sigma_x = sigma_x, mu = mu)
    # generate the XtXinv portion of equation
    XtXinv = qr.solve(crossprod(X))
    sqrtXtXinv55 = sqrt(XtXinv[5, 5])
    y = X %*% beta + rnorm(n = n)
    beta.hat = XtXinv %*% crossprod(X, y)
    sE = sqrt(sum((y - X %*% beta.hat)^2)/(n - p))
    t.val = beta.hat[3]/(sE * sqrtXtXinv55)
    pval.list[r] = 2 * pt(-abs(t.val), df = n - p)
  }
  return(pval.list)
}
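The pval.dist.sim function above calls a helper gen_X that is not shown in the question. To make the snippet self-contained, here is a hypothetical stand-in (an assumption about what gen_X does, based on its arguments): it draws n rows of p predictors with common mean mu, standard deviation sigma_x, and pairwise correlation rho:

## Hypothetical stand-in for the asker's gen_X (equicorrelated normal predictors)
gen_X = function(n, p, rho, sigma_x, mu) {
  Sigma = sigma_x^2 * (rho + (1 - rho) * diag(p))  # equicorrelation covariance
  R = chol(Sigma)                                  # Sigma = t(R) %*% R
  mu + matrix(rnorm(n * p), n, p) %*% R            # rows ~ N(mu, Sigma)
}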
Above is the p-value simulation. I need to run this function to generate the p-values for my power curve:
set.seed(3701)
# givens
p = 5; d = 2; mu = 10; sigmasqrd = 2; reps = 2500
n.list = seq(from = 10, to = 150, by = 10)
# create a vector for the estimates of the power
est.power = numeric(length(n.list))
# create a vector for the left endpoints of the 95% CI
LB.list = numeric(length(n.list))
# create a vector for the right endpoints of the 95% CI
UB.list = numeric(length(n.list))
for (j in 1:length(n.list)) {
  # perform the test reps times
  pvals = pval.dist.sim(n = n.list[j], sigma_x = 1.5, rho = 0.2, reps = reps)
  # record the simulated estimate of the power
  est.power[j] = mean(pvals < 0.05)
  # compute the 95% conf int
  bounds = binom.test(x = sum(pvals < 0.05), n = reps, conf.level = 0.95)$conf.int[1:2]
  LB.list[j] = bounds[1]
  UB.list[j] = bounds[2]
}
## plot the power curve estimation
plot(n.list, est.power, t = "l", xlab = "n", ylab = "Power")
The issue I am having is that my p-values, when plugged in, give drastically low power estimates: values in the single-digit percentages. What am I doing wrong?
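Two things stand out in the code above (the question has no posted answer here): the t statistic pairs beta.hat[3] with sqrt(XtXinv[5, 5]), whereas a t-test of a single coefficient should use matching indices (beta.hat[5] with XtXinv[5, 5], or beta.hat[3] with XtXinv[3, 3]); and beta[3] = 0 in the data-generating vector, so that particular null hypothesis is true, and mean(pvals < 0.05) should then hover near the 0.05 level for every n rather than grow into a power curve. A quick check (using the gen_X stand-in above):

## under a true null the p-values should be roughly uniform on (0, 1)
pvals = pval.dist.sim(n = 100, sigma_x = 1.5, rho = 0.2)
hist(pvals, breaks = 20)
mean(pvals < 0.05)   # should sit near 0.05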

TrueSkill Implementation in R

I am trying to program the basic Vanilla TrueSkill (3.1) algorithm in R but am getting some strange results.
My code is the following:
# A simple test between two players repeatedly playing one another
betaSq = 0.1
obs = 10000
p1_skills = 0.333
p2_skills = 0
p1_draws = rnorm(obs, p1_skills, sqrt(betaSq))
p2_draws = rnorm(obs, p2_skills, sqrt(betaSq))

p1_pred_mu = rep(NA, obs + 1)
p1_pred_sigmaSq = rep(NA, obs + 1)
p2_pred_mu = rep(NA, obs + 1)
p2_pred_sigmaSq = rep(NA, obs + 1)

# Initial values
p1_pred_mu[1] = 0
p1_pred_sigmaSq[1] = 1
p2_pred_mu[1] = 0
p2_pred_sigmaSq[1] = 1

results = p1_draws > p2_draws
probs = rep(NA, obs)

# Run TrueSkill
for (i in seq(2, obs + 1)) {
  probs[i-1] = predictProb(p1_pred_mu[i-1], p1_pred_sigmaSq[i-1],
                           p2_pred_mu[i-1], p2_pred_sigmaSq[i-1], betaSq)
  out = updateSkill(p1_pred_mu[i-1], p1_pred_sigmaSq[i-1],
                    p2_pred_mu[i-1], p2_pred_sigmaSq[i-1], betaSq, results[i-1])
  # Now update based on the output
  p1_pred_mu[i] = out$mu1
  p1_pred_sigmaSq[i] = out$sigmaSq1
  p2_pred_mu[i] = out$mu2
  p2_pred_sigmaSq[i] = out$sigmaSq2
}

# Output results
dev.new()
mu = p1_pred_mu
lower = qnorm(0.05, p1_pred_mu, p1_pred_sigmaSq)
upper = qnorm(0.95, p1_pred_mu, p1_pred_sigmaSq)
plot(mu, ylim = c(min(lower), max(upper)), main = "p1")
lines(lower)
lines(upper)

dev.new()
mu = p2_pred_mu
lower = qnorm(0.05, p2_pred_mu, p2_pred_sigmaSq)
upper = qnorm(0.95, p2_pred_mu, p2_pred_sigmaSq)
plot(mu, ylim = c(min(lower), max(upper)), main = "p2")
lines(lower)
lines(upper)

a = filter(probs, rep(1, 20))/20   # 20-game moving average (stats::filter)
dev.new()
plot(a)

print(sprintf("Mean p1: %g", mean(p1_pred_mu)))
print(sprintf("Mean p2: %g", mean(p2_pred_mu)))
print(sprintf("Mean results: %g", mean(results)))
print(sprintf("Mean predicted results: %g", mean(probs)))
The functions that are called are:
# Functions
updateSkill <- function(mu1, sigmaSq1, mu2, sigmaSq2, betaSq, result) {
  # http://papers.nips.cc/paper/3331-trueskill-through-time-revisiting-the-history-of-chess.pdf
  c = 2*betaSq + sigmaSq1 + sigmaSq2
  if (result == 1) {
    # Player 1 wins
    v = dnorm((mu1 - mu2)/c)/pnorm((mu1 - mu2)/c)
    w = v*(v + (mu1 - mu2)/c)
    mu1 = mu1 + (sigmaSq1/c)*v
    mu2 = mu2 - (sigmaSq2/c)*v
    sigmaSq1 = sigmaSq1 * sqrt(1 - (sigmaSq1/c^2)*w)
    sigmaSq2 = sigmaSq2 * sqrt(1 - (sigmaSq2/c^2)*w)
  } else if (result == 0) {
    # Player 2 wins
    v = dnorm((mu2 - mu1)/c)/pnorm((mu2 - mu1)/c)
    w = v*(v + (mu2 - mu1)/c)
    mu1 = mu1 - (sigmaSq1/c)*v
    mu2 = mu2 + (sigmaSq2/c)*v
    sigmaSq1 = sigmaSq1 * sqrt(1 - (sigmaSq1/c^2)*w)
    sigmaSq2 = sigmaSq2 * sqrt(1 - (sigmaSq2/c^2)*w)
  }
  return(list(mu1 = mu1, mu2 = mu2, sigmaSq1 = sigmaSq1, sigmaSq2 = sigmaSq2))
}

predictProb <- function(mu1, sigmaSq1, mu2, sigmaSq2, betaSq) {
  # Predict the probability of player 1 beating player 2 under the TrueSkill model
  mean1 = mu1
  mean2 = mu2
  var1 = sigmaSq1 + betaSq
  var2 = sigmaSq2 + betaSq
  # The distribution of player1 - player2 is N(mean1 - mean2, sd = sqrt(var1 + var2))
  prob1Wins = pnorm(0, mean2 - mean1, sqrt(var1 + var2))
  return(prob1Wins)
}
I hate to post such a massive blob of code, but I really cannot figure out where things are going wrong.
The program runs and the predicted skills (distributed as N(mu, sigma)) converge. However, the predicted probabilities are not converging to the true probabilities of the results!
A sample output is:
[1] "Mean p1: 0.0762161"
[1] "Mean p2: -0.0762161"
[1] "Mean results: 0.7733"
[1] "Mean predicted results: 0.631424"
Any idea what is going wrong?
The reason this didn't work is that the third line of the updateSkill function should read
c = sqrt(2*betaSq + sigmaSq1 + sigmaSq2)
not
c = 2*betaSq + sigmaSq1 + sigmaSq2
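For reference (an added check, not part of the original answer): with the simulation's parameters, each performance is N(skill, betaSq), so the true probability that player 1 wins a game has a closed form:

## p1 - p2 ~ N(p1_skills - p2_skills, 2 * betaSq)
pnorm((0.333 - 0) / sqrt(2 * 0.1))   # ~0.772, matching the observed mean of results (0.7733)

With the sqrt fix in place, the mean predicted probability should converge toward this value.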
