Adding confidence intervals to a plot from simulation data in R

I've created a probit simulation based on a likelihood function and simulated data, all of which can be replicated with the code below.
This is the likelihood function:
probit.ll <- function(par,ytilde,x) {
a <- par[1]
b <- par[2]
return( -sum( pnorm(ytilde*(a + b*x),log=TRUE) ))
}
This is the function to do the estimates:
my.probit <- function(y,x) {
# use OLS to get start values
par <- lm(y~x)$coefficients
ytilde <- 2*y-1
# Run optim
res <- optim(par,probit.ll,hessian=TRUE,ytilde=ytilde,x=x)
# Return point estimates and SE based on the inverse of Hessian
names(res$par) <- c('a','b')
se=sqrt(diag(solve(res$hessian)))
names(se) <- c('a','b')
return(list(par=res$par,se=se,cov=solve(res$hessian)))
}
And this is the function to generate the simulated model:
probit.data <- function(N=100,a=1,b=1) {
x <- rnorm(N)
y.star <- a + b*x + rnorm(N)
y <- (y.star > 0)
return( as.data.frame(cbind(y,x,y.star)) )
}
This simulates a dataset of size n = 100 (here with a = 2):
probit.data100 <- function(N=100,a=2,b=1) {
x <- rnorm(N)
y.star <- a + b*x + rnorm(N)
y <- (y.star > 0)
return( as.data.frame(cbind(y,x,y.star)) )
}
# standard error of the predicted probability (delta method)
se.probit.phat100 <- function(x, par, V) {
z <- par[1] + par[2] * x
# Derivative of pnorm(a + b*x) w.r.t. alpha and beta
J <- c( dnorm(z), dnorm(z)*x )
return( sqrt(t(J) %*% V %*% J) )
}
dat100 <- probit.data100()
res100 <- my.probit(dat100$y,dat100$x)
res100
The code below calculates the confidence intervals based on a non-parametric bootstrap approach (notice the sample function being used):
N <- dim(probit.data(N=100, a=1, b=1))[1]
npb.par <- matrix(NA,100,2)
colnames(npb.par) <- c("alpha","beta")
npb.eystar <- matrix(NA,100,N)
for (t in 1:100) {
thisdta <- probit.data(N=100, a=1, b=1)[sample(1:N,N,replace=TRUE),]
npb.par[t,] <- my.probit(thisdta$y,thisdta$x)$par
}
The function below just cleans up the bootstrap output; the confidence intervals are what I would like to plot:
processres <- function(simres) {
z <- t(apply(simres,2,function(x) { c(mean(x),median(x),sd(x),quantile(x,c(0.05,0.95))) } ))
rownames(z) <- colnames(simres)
colnames(z) <- c("mean","median","sd","5%","95%")
z
}
processres(npb.par)
I would like to plot a graph like this (the one below), but add confidence intervals based on the processres function above. How can these confidence intervals be added to the plot?
x <- seq(-5,5,length=100)
plot(x, pnorm(1 - 0.5*x), ty='l', lwd=2, bty='n', xlab='x', ylab="Pr(y=1)")
rug(dat100$x)
I'm also open to a different plot code and/or package. I just want a graph based on this simulation with added confidence intervals.
Thanks!

Here's a way to add a shaded CI based on simulation results:
UPDATE: this now plots the expected curve (i.e. using mean alpha & beta values), and correctly passes these means to rnorm.
x <- seq(-5,5,length=100)
plot(x, pnorm(1 - 0.5*x), ty='n', lwd=2, bty='n', xlab='x', ylab="Pr(y=1)",
xaxs = 'i', ylim=c(0, 1))
params <- processres(npb.par)
sims <- 100000
sim.mat <- matrix(NA, ncol=length(x), nrow=sims)
for (i in 1:sims) {
alpha <- rnorm(1, params[1, 1], params[1, 3])
beta <- rnorm(1, params[2, 1], params[2, 3])
sim.mat[i, ] <- pnorm(alpha + beta*x)  # model is Pr(y=1) = pnorm(a + b*x)
}
CI <- apply(sim.mat, 2, function(x) quantile(x, c(0.05, 0.95)))
polygon(c(x, rev(x)), c(CI[1, ], rev(CI[2, ])), col='gray', border=NA)
lines(x, pnorm(params[1, 1] + params[2, 1]*x), lwd=2)  # expected curve from mean alpha and beta
rug(dat100$x)
box()
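For completeness, here is a hedged sketch of an analytic alternative that needs no simulation: the question already defines the delta-method helper se.probit.phat100, and my.probit returns the parameter covariance matrix, so a pointwise band for the fitted curve from res100 and dat100 can be drawn directly (the 1.645 multiplier matches the 5%/95% quantiles used above):
x <- seq(-5, 5, length=100)
phat <- pnorm(res100$par['a'] + res100$par['b']*x)
# delta-method standard error of the predicted probability at each x
se.phat <- sapply(x, se.probit.phat100, par=res100$par, V=res100$cov)
plot(x, phat, ty='l', lwd=2, bty='n', xlab='x', ylab="Pr(y=1)", ylim=c(0, 1))
lines(x, pmin(phat + 1.645*se.phat, 1), lty=2)  # clipped to [0, 1] for display
lines(x, pmax(phat - 1.645*se.phat, 0), lty=2)
rug(dat100$x)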

Related

How do I save the results of this for loop as a vector rather than as a single value?

I am having trouble saving the results of a for loop in the way that I want.
The loop I'm currently running looks like this:
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS)
The loop seems to be working correctly, except for the fact that I would like to end up with 19 values of Bhat_OLS and only currently get 1 value. I want to have a Bhat_OLS value for each value of p in P so that I can plot Bhat_OLS against p.
You can write your results into a data frame with two columns, containing P and Bhat_OLS.
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# initialize result data frame
results <- data.frame(matrix(ncol = 2, nrow = 0,
dimnames = list(NULL, c("P", "Bhat_OLS"))))
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
# insert P and Bhat_OLS into results
results[nrow(results) + 1,] = c(p, Bhat_OLS)
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(results$P, results$Bhat_OLS)
The fact that you loop over the values of P directly makes the indexing difficult. You could loop over seq(P) instead and subset with P[i]. Also, at the end you need Bhat_OLS[i]. Then it works.
# Setup objects
n <- 100
R <- (1:1000)
P <- seq(-.9, .9, .1)
betahat_OLS <- rep(NA, length(R))
Bhat_OLS <- rep(NA, length(P))
set.seed(42) ## for sake of reproducibility
# Calculate betahat_OLS for each p in P and each r in R
for (i in seq(P)) {
for (r in R) {
# Simulate data
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- P[i]*v + e
x <- z + v
y <- 0*x + u
#Calculate betahat_OLS
betahat_OLS[r] <- sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS[i] <- sum(betahat_OLS)/1000 - 0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS, xlim=c(-1, 1))
Alternative solution: vapply()
In a more R-ish way (the current version is rather C-ish), you could define the simulation in a function sim() and use vapply() for the outer loop. (You could also do this for the inner loop, but I've tested it and this way is faster.)
sim <- \(p, n=100, R=1:1000) {
r <- rep(NA, max(R))
for (i in R) {
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- p*v + e
x <- z + v
y <- 0*x + u
r[i] <- sum(x*y)/sum(x^2)
}
return(sum(r/1000 - 0))
}
set.seed(42)
Bhat_OLS1 <- vapply(seq(-.9, .9, .1), \(p) sim(p), 0)
stopifnot(all.equal(Bhat_OLS, Bhat_OLS1))
Note:
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"

Hist with lines in R

I generate four parts of a big dataset: cluster1 (10000 points), cluster2 (15000 points), cluster3 (15000 points) and throws (500 points). Here is the code:
library('MASS')
library('fpc')
#library("dbscan")
library("factoextra")
library("clustertend")
library("boot")
library("stream")
set.seed(123)
mu1<-c(-5,-7)
mu1
sigma1<-matrix(c(4,-2,-2,2), nrow=2, ncol=2, byrow = TRUE)
sigma1
n<-10000
cluster1<-mvrnorm(n,mu1,sigma1)
cluster1
#cluster1<-as.data.frame(cluster1)
#cluster1
#c<-runif(10000,1,1000)
#c
phi <- runif(15000, max = 2*pi)
rho <- sqrt(runif(15000))
x <- sqrt(5)*rho*cos(phi) + 6
y <- sqrt(10/3)*rho*sin(phi) + 4
range(2*(x - 6)^2 + 3*(y - 4)^2)
#[1] 0.001536582 9.999425234
plot(x, y)
cluster2<-cbind(x,y)
cluster2
u <- runif(15000, max = 3)
v <- runif(15000, max = 2)
x <- u + v - 10
y <- v - u + 8
range(x + y)
#[1] -1.999774 1.999826
range(x - y + 15)
#[1] -2.999646 2.999692
plot(x, y)
cluster3<-cbind(x,y)
cluster3
#cluster3<-as.data.frame(cluster1)
#cluster3
x <- runif(500, -20, 20)
y <- runif(500, -20, 20)
#u <- runif(500, max = 20)
#v <- runif(500, max = 20)
#x <- u + v - 20
#y <- v - u
range(x)
range(y)
plot(x,y)
throws<-cbind(x,y)
throws
data<-rbind(cluster1,cluster2,cluster3,throws)
data<-as.data.frame(data)
data
plot(data)
Then, using the bootstrap method, I try to construct a distribution of the Hopkins statistic H for a fixed m equal to 7% of the total number of generated points (m = 2835). Here is the code where I do this:
B <- 10   # number of iterations
H <- NULL # values of the Hopkins statistic
for(i in 1:B){
N<-dim(data)[1]
s<-sample(N,0.8*N)
stat<-hopkins(data[s,], n=2835, byrow = TRUE)$H
H[i]<-stat
#print(c(i, stat))
}
It takes very long to run. Then I want to compare this result with the beta distribution B(m, m). Here is the code:
hist(H)
#(density(H), col="red")
#hist(distB)
X<-seq(min(H), max(H), 0.001)
X
lines(X, dbeta(X,2835,2835), type="l", col="red")
The problem is that the line is not drawn on the histogram. Can anybody say what the problem is? In the resulting image I can see the red line, but it's not quite right.
Your y-axis values plotted by dbeta() are way too low to register on the supplied y-axis (<0.0000001). You need to overlay the second plot:
# sample data
H <- sample(seq(0.455,0.475,0.001), 1000, replace = TRUE)
#plot histogram
hist(H)
# prepare graphics to add second plot
par(new = TRUE)
# sample data for second plot
X <- seq(0.455,0.475, 0.001)
Y <- dbeta(X,2835,2835)
# plot second plot, remove axes
plot(X, dbeta(X,2835,2835), type="l", col="red", axes = FALSE)
axis(4, Y) # add axis on right side
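An alternative sketch that avoids the second axis entirely (assuming the same H as above): draw the histogram on a density scale and add the curve with curve(), so both share one y-axis.
# plot histogram as a density rather than counts
hist(H, freq=FALSE, xlab="H")
# overlay the theoretical Beta(2835, 2835) density on the same scale
curve(dbeta(x, 2835, 2835), col="red", lwd=2, add=TRUE)
Note that if the simulated H values sit far from 0.5, where Beta(2835, 2835) concentrates, the red curve will stay near zero on this common scale; that would point to a mismatch between the data and the reference distribution rather than a plotting problem.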

Rejection sampling in R

I'm doing rejection sampling for the density function 0.5*sin(|x|) on -0.5 < x < 0.5.
These are the functions I created,
f <- function(x){
ifelse((x<0.5 & x>-0.5), 0.5*sin(abs(x)), 0)
}
rf.reject <- function(n) {
M <- 0.24
results <- numeric()
while (length(results) < n) {
X <- runif(2*(n - length(results)), min = -0.5, max = 0.5)
Y <- M*runif(length(X))*1
keep <- Y < f(X)
results <- c(results, X[keep])
}
results[1:n]
}
But when I plot the histogram, the densities given by the pdf f(x) differ from the densities shown by the histogram.
hist(rf.reject(10000), prob=TRUE, breaks = "Scott" )
curve(f(x), add=TRUE, col="red")
Side note: Candidate function g used for sampling is Unif[-1/2,1/2]
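A hedged observation that may explain the mismatch: 0.5*sin(|x|) integrates to 1 - cos(0.5) ≈ 0.122 over (-0.5, 0.5), so it is not a normalized density. The rejection sampler draws from the normalized version of f, so rescaling the overlaid curve by that constant should make the histogram and the curve agree (a sketch, using f and rf.reject from above):
const <- 1 - cos(0.5)  # integral of 0.5*sin(|x|) over (-0.5, 0.5)
hist(rf.reject(10000), prob=TRUE, breaks="Scott")
curve(f(x)/const, add=TRUE, col="red", lwd=2)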

Drawing decision boundaries in R

I've got a series of modelled class labels from the knn function, a data frame with basic numeric training data, and another data frame of test data. How would I go about drawing a decision boundary for the values returned by knn? I'll have to replicate my findings on a locked-down machine, so please limit the use of third-party libraries if possible.
I only have two class labels, "orange" and "blue". They're plotted on a simple 2D plot with the training data. Again, I just want to draw a boundary around the results from the knn function.
Code:
library(class)
n <- 100
set.seed(1)
x <- round(runif(n, 1, n))
set.seed(2)
y <- round(runif(n, 1, n))
train.df <- data.frame(x, y)
set.seed(1)
x.test <- round(runif(n, 1, n))
set.seed(2)
y.test <- round(runif(n, 1, n))
test.df <- data.frame(x.test, y.test)
k <- knn(train.df, test.df, classes, k=25)
plot(test.df, col=k)
classes is just a vector of class labels determined from an earlier bit of code.
If you need it, below is the complete code for my work:
library(class)
n <- 100
set.seed(1)
x <- round(runif(n, 1, n))
set.seed(2)
y <- round(runif(n, 1, n))
# ============================================================
# Bayes Classifier + Decision Boundary Code
# ============================================================
classes <- "null"
colours <- "null"
for (i in 1:n)
{
# P(C = j | X = x, Y = y) = prob
# "The probability that the class (C) is orange (j) when X is some x, and Y is some y"
# Two predictors that influence classification: x, y
# If x and y are both under 50, there is a 90% chance of being orange (grouping)
# If x and y and both over 50, or if one of them is over 50, grouping is blue
# Algorithm favours whichever grouping has a higher chance of success, then plots using that colour
# When prob (from above) is 50%, the boundary is drawn
percentChance <- 0
if (x[i] < 50 && y[i] < 50)
{
# 95% chance of orange and 5% chance of blue
# Bayes Decision Boundary therefore assigns to orange when x < 50 and y < 50
# "colours" is the Decision Boundary grouping, not the plotted grouping
percentChance <- 95
colours[i] <- "orange"
}
else
{
percentChance <- 10
colours[i] <- "blue"
}
if (round(runif(1, 1, 100)) > percentChance)
{
classes[i] <- "blue"
}
else
{
classes[i] <- "orange"
}
}
boundary.x <- seq(0, 100, by=1)
boundary.y <- 0
for (i in 1:101)
{
if (i > 49)
{
boundary.y[i] <- -10 # just for the sake of visual consistency, real value is 0
}
else
{
boundary.y[i] <- 50
}
}
df <- data.frame(boundary.x, boundary.y)
plot(x, y, col=classes)
lines(df, type="l", lty=2, lwd=2, col="red")
# ============================================================
# K-Nearest neighbour code
# ============================================================
#library(class)
#n <- 100
#set.seed(1)
#x <- round(runif(n, 1, n))
#set.seed(2)
#y <- round(runif(n, 1, n))
train.df <- data.frame(x, y)
set.seed(1)
x.test <- round(runif(n, 1, n))
set.seed(2)
y.test <- round(runif(n, 1, n))
test.df <- data.frame(x.test, y.test)
k <- knn(train.df, test.df, classes, k=25)
plot(test.df, col=k)
Get the class probability predictions on a grid, and draw a contour line at P=0.5 (or whatever you want the cutoff point to be). This is also the method used in the classic MASS textbook by Venables and Ripley, and in Elements of Statistical Learning by Hastie, Tibshirani and Friedman.
# class labels: simple distance from origin
classes <- ifelse(x^2 + y^2 > 60^2, "blue", "orange")
classes.test <- ifelse(x.test^2 + y.test^2 > 60^2, "blue", "orange")
grid <- expand.grid(x=1:100, y=1:100)
classes.grid <- knn(train.df, grid, classes, k=25, prob=TRUE) # note last argument
prob.grid <- attr(classes.grid, "prob")
prob.grid <- ifelse(classes.grid == "blue", prob.grid, 1 - prob.grid)
# plot the boundary
contour(x=1:100, y=1:100, z=matrix(prob.grid, nrow=100), levels=0.5,
col="grey", drawlabels=FALSE, lwd=2)
# add points from test dataset
points(test.df, col=classes.test)
See also basically the same question on CrossValidated.
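As an optional extra (a sketch built on the grid objects above, not part of the method itself), the two predicted regions can also be shaded with image() before drawing the 0.5 contour:
z <- matrix(prob.grid, nrow=100)  # P(blue) on the grid
# shade cells below/above 0.5 with translucent orange/blue
image(x=1:100, y=1:100, z=z, breaks=c(-0.1, 0.5, 1.1),
      col=c(rgb(1, 0.65, 0, 0.2), rgb(0, 0, 1, 0.2)), xlab="x", ylab="y")
contour(x=1:100, y=1:100, z=z, levels=0.5,
        col="grey", drawlabels=FALSE, lwd=2, add=TRUE)
points(test.df, col=classes.test)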

nls leave one out cv plot in R

I'm trying to do leave-one-out cross-validation of a non-linear regression and plot the optimal fit. I feel like my LOOCV and plotting code are totally wrong. Could anybody clarify what I'm doing wrong?
data(Boston, package='MASS')
y <- Boston$nox
x <- Boston$dis
n <- length(x)
nla <- n
las <- seq(0, .85, length=nla)
cvs <- rep(0, nla)
for(j in 1:nla) {
prs <- rep(0,n)
for(i in 1:n) {
yi <- y[-i]
xi <- x[-i]
d <- nls(y~ A + B * exp(C * x), start=list(A=0.5, B=0.5, C=-0.5))
prs[i] <- predict(d, newdata=data.frame(xi=x[i]))
}
cvs[j] <- mean( (y - prs)^2 )
}
cvs[j]
plot(y~x, pch=19, col='gray', cex=1.5,xlab='dis', ylab='nox')
d <- nls(y~ A + B * exp(C * x), start=list(A=0.5, B=0.5, C=-0.5))
lines(predict(d)[order(x)]~sort(x), lwd=4, col='black')
You seem to have been close, but inside your loop you were still fitting to the full data x and y. As far as I can tell, you only need a single loop to fit the model in each leave-one-out scenario, so I can't see the need for the variables las or prs. For reference, the plot shows the leave-one-out mean squared error (LOO MSE) and the mean squared error of the residuals (MSE) for the nls model fit to the full data set.
Script:
require(MASS)
data(Boston, package='MASS')
y <- Boston$nox
x <- Boston$dis
n <- length(x)
cvs <- rep(0, n)
for(j in seq(n)){
ys <- y[-j]
xs <- x[-j]
d <- nls(ys ~ A + B * exp(C * xs), start=list(A=0.5, B=0.5, C=-0.5))
cvs[j] <- (y[j] - predict(d, data.frame(xs=x[j])))^2
print(paste0(j, " of ", n, " finished (", round(j/n*100), "%)"))
}
plot(y~x, pch=19, col='gray', cex=1.5, xlab='dis', ylab='nox')
d <- nls(y~ A + B * exp(C * x), start=list(A=0.5, B=0.5, C=-0.5))
lines(predict(d)[order(x)]~sort(x), lwd=4, col='black')
usr <- par("usr")
text(usr[1] + 0.9*(usr[2]-usr[1]), usr[3] + 0.9*(usr[4]-usr[3]), paste("LOO MSE", "=", round(mean(cvs), 5)), pos=2)
text(usr[1] + 0.9*(usr[2]-usr[1]), usr[3] + 0.8*(usr[4]-usr[3]), paste("MSE", "=", round(mean(resid(d)^2), 5)), pos=2)
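For reference, a more compact sketch of the same leave-one-out computation, fitting directly from the Boston data frame (an illustration of the same idea, not a replacement for the loop above):
cvs2 <- vapply(seq_len(nrow(Boston)), function(j) {
  # refit without row j, then score the squared error at row j
  fit <- nls(nox ~ A + B*exp(C*dis), data=Boston[-j, ],
             start=list(A=0.5, B=0.5, C=-0.5))
  (Boston$nox[j] - predict(fit, newdata=Boston[j, ]))^2
}, numeric(1))
mean(cvs2)  # should be close to the LOO MSE printed on the plot above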
