Rejection sampling in R - r

I'm doing rejection sampling for the density function 0.5sin(|x|) for -0.5<x<0.5.
These are the functions I created,
f <- function(x){
ifelse((x<0.5 & x>-0.5), 0.5*sin(abs(x)), 0)
}
rf.reject <- function(n) {
M <- 0.24
results <- numeric()
while (length(results) < n) {
X <- runif(2*(n - length(results)), min = -0.5, max = 0.5)
Y <- M*runif(length(X))*1
keep <- Y < f(X)
results <- c(results, X[keep])
}
results[1:n]
}
But when I go to plot the histogram, the actual densities under the pdf f(x) differ from the densities under the histogram.
hist(rf.reject(10000), prob=TRUE, breaks = "Scott" )
curve(f(x), add=TRUE, col="red")
Side note: Candidate function g used for sampling is Unif[-1/2,1/2]

Related

How do I save the results of this for loop as a vector rather than as a single value?

I am having trouble saving the results of a for loop in the way that I want.
The loop I'm currently running looks like this:
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS)
The loop seems to be working correctly, except for the fact that I would like to end up with 19 values of Bhat_OLS and only currently get 1 value. I want to have a Bhat_OLS value for each value of p in P so that I can plot Bhat_OLS against p.
You can write your results into a data frame with two columns, containing P and Bhat_OLS.
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# initialize result data frame
results <- data.frame(matrix(ncol = 2, nrow = 0,
dimnames = list(NULL, c("P", "Bhat_OLS"))))
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
# insert P and Bhat_OLS into results
results[nrow(results) + 1,] = c(p, Bhat_OLS)
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(results$P, results$Bhat_OLS)
The fact that you loop over the probabilities makes it difficult with the indices. You could loop over seq(P) instead and subset P[i]. Also, at the end you need Bhat_OLS[i]. Then it works.
# Setup objects
n <- 100
R <- (1:1000)
P <- seq(-.9, .9, .1)
betahat_OLS <- rep(NA, length(R))
Bhat_OLS <- rep(NA, length(P))
set.seed(42) ## for sake of reproducibility
# Calculate betahat_OLS for each p in P and each r in R
for (i in seq(P)) {
for (r in R) {
# Simulate data
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- P[i]*v + e
x <- z + v
y <- 0*x + u
#Calculate betahat_OLS
betahat_OLS[r] <- sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS[i] <- sum(betahat_OLS)/1000 - 0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS, xlim=c(-1, 1))
Alternative solution vapply
In a more R-ish way (right now it is more c-ish) you could define the simulation in a function sim() and use vapply for the outer loop. (Actually also for the inner loop, but I've tested it and this way it's faster.)
sim <- \(p, n=100, R=1:1000) {
r <- rep(NA, max(R))
for (i in R) {
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- p*v + e
x <- z + v
y <- 0*x + u
r[i] <- sum(x*y)/sum(x^2)
}
return(sum(r/1000 - 0))
}
set.seed(42)
Bhat_OLS1 <- vapply(seq(-.9, .9, .1), \(p) sim(p), 0)
stopifnot(all.equal(Bhat_OLS, Bhat_OLS1))
Note:
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"

Hist with lines in R

I generate 4 parts of big data: cluster1(10000 points), cluster2(15000 points), cluster3(15000 points) and throws(500 points). Here is the code:
library('MASS')
library('fpc')
#library("dbscan")
library("factoextra")
library("clustertend")
library("boot")
library("stream")
set.seed(123)
mu1<-c(-5,-7)
mu1
sigma1<-matrix(c(4,-2,-2,2), nrow=2, ncol=2, byrow = TRUE)
sigma1
n<-10000
cluster1<-mvrnorm(n,mu1,sigma1)
cluster1
#cluster1<-as.data.frame(cluster1)
#cluster1
#c<-runif(10000,1,1000)
#c
phi <- runif(15000, max = 2*pi)
rho <- sqrt(runif(15000))
x <- sqrt(5)*rho*cos(phi) + 6
y <- sqrt(10/3)*rho*sin(phi) + 4
range(2*(x - 6)^2 + 3*(y - 4)^2)
#[1] 0.001536582 9.999425234
plot(x, y)
cluster2<-cbind(x,y)
cluster2
u <- runif(15000, max = 3)
v <- runif(15000, max = 2)
x <- u + v - 10
y <- v - u + 8
range(x + y)
#[1] -1.999774 1.999826
range(x - y + 15)
#[1] -2.999646 2.999692
plot(x, y)
cluster3<-cbind(x,y)
cluster3
#cluster3<-as.data.frame(cluster1)
#cluster3
x <- runif(500, -20, 20)
y <- runif(500, -20, 20)
#u <- runif(500, max = 20)
#v <- runif(500, max = 20)
#x <- u + v - 20
#y <- v - u
range(x)
range(y)
plot(x,y)
throws<-cbind(x,y)
throws
data<-rbind(cluster1,cluster2,cluster3,throws)
data<-as.data.frame(data)
data
plot(data)
Then I try by using the bootstrap method, construct a distribution of H statistics for some
fixed m, which is from 7% of the total number of generated points(m=2835). Here is th code where I do this:
B<-10#number of iterations
H<-NULL#value of Hopkins statistic
for(i in 1:B){
N<-dim(data)[1]
s<-sample(N,0.8*N)
stat<-hopkins(data[s,], n=2835, byrow = TRUE)$H
H[i]<-stat
#print(c(i, stat))
}
It takes very to generate. Then I should to compare this result with beta distribution - B(m,m). Here is the code:
hist(H)
#(density(H), col="red")
#hist(distB)
X<-seq(min(H), max(H), 0.001)
X
lines(X, dbeta(X,2835,2835), type="l", col="red")
The problem is that lined doesn't draw on hist. Can anybody say what is the problem? Here is the image, I see red line, but it's not exactly right.
Your y-axis values plotted by dbeta() are way too low to register on the supplied y-axis (<0.0000001). You need to overlay the second plot:
# sample data
H <- sample(seq(0.455,0.475,0.001), 1000, replace = TRUE)
#plot histogram
hist(H)
# prepare graphics to add second plot
par(new = TRUE)
# sample data for second plot
X <- seq(0.455,0.475, 0.001)
Y <- dbeta(X,2835,2835)
# plot second plot, remove axes
plot(X, dbeta(X,2835,2835), type="l", col="red", axes = FALSE)
axis(4, Y) # add axis on right side

using grid.arrange with loop holding multiple plots

I am trying to create a plot where for each i there is a density graph and a histogram side by side. For this instance i = 1..3
The problem I have is creating the list to pass to grid.arrange. However I do it it seems to repeat itself somehow.
df:
x1 x2 x3
1 108.28 17.05 1484.10
2 152.36 16.59 750.33
3 95.04 10.91 766.42
4 65.45 14.14 1110.46
5 62.97 9.52 1031.29
6 263.99 25.33 195.26
7 265.19 18.54 193.83
8 285.06 15.73 191.11
9 92.01 8.10 1175.16
10 165.68 11.13 211.15
X <- df
mu.X <- colMeans(X)
cov.X <- cov(X)
eg <- eigen(cov.X)
myprinboot = function(
X,
iter = 10000,
alpha = 0.05,
prettyPlot = T
){
# Find the dimensions of X
nrX <- dim(X)[1]
nx <- dim(X)[2]
# Make matrices of suitable sizes to hold the booted parameter estimates
# lambda
# each cov matrix will have nx lambdas
lambda.mat <- matrix(NA, nr = nx, nc = iter)
# e vectors nx components each and one vector per eigen value
# Each cov matrix will therefore produce a nx X nx matrix of components
Y.mat <- matrix(NA, nr = nx, nc = iter * nx)
# For loop to fill the matrices created above
for (i in 1:iter)
{
# ind will contain random integers used to make random samples of the X matrix
# Must use number of rows nrX to index
ind <- sample(1:nrX,nrX,replace=TRUE)
# eigen will produce lambdas in decreasing order of size
# make an object then remove extract the list entries using $
eigvalvec <- eigen(cov(X[ind,]))
lambda.mat[,i] <- eigvalvec$values
colstart <- 1 + nx * (i - 1)
colend <- colstart + nx - 1
Y.mat[,colstart:colend] = eigvalvec$vectors
}
if(prettyPlot){
p <- list()
i <- 0
for(j in 1:(2*nx))
{
if (j %% 2 == 0){
p[[j]] <- ggplot(NULL, aes(lambda.mat[i,])) +
geom_histogram(color = 'black', fill = 'green', alpha = .5) +
xlab(substitute(lambda[i])) +
ggtitle(substitute(paste("Histogram of the pc variance ", lambda[i])))
} else {
i <- i + 1
p[[j]] <- ggplot(NULL, aes(lambda.mat[i,])) +
geom_density(fill = 'blue', alpha = .5) +
xlab((substitute(lambda[i]))) +
ggtitle(substitute(paste("Density plot of the pc variance ", lambda[i])))
}
do.call(grid.arrange, p)
}
do.call(grid.arrange, p)
} else {
layout(matrix(1:(2*nx),nr=nx,nc=2,byrow=TRUE))
for(i in 1:nx)
{
plot(density(lambda.mat[i,]),xlab=substitute(lambda[i]),
main=substitute(paste("Density plot of the pc variance ", lambda[i])
))
hist(lambda.mat[i,],xlab=substitute(lambda[i]),
main=substitute(paste("Histogram of the pc variance ", lambda[i])))
}
}
library(rgl)
plot3d(t(lambda.mat))
list(lambda.mat = lambda.mat, Y.mat = Y.mat)
}
pc <- myprinboot(X = Y, iter=1000, alpha=0.5)
Output
Anyone have any clue what I am doing wrong or is this just not possible?
I don't understand your code, Jay, as it seems to do lots of things and use both base and ggplot plotting, but if all you want is to create a combined histogram and density plot for each j, why not loop over j and inside that for j loop do something like this:
d <- your density plot created so that it depends on j only
h <- your histogram plot created so that it depends on j only
p[[j]] <- grid.arrange(d,h,ncol=2)
Then, when you come out of the loop, you'll have an object p which consists of a list of plots, with each plot consisting of a combination of density plot and histogram.
Then you could use the cowplot package (after installing it) to do something like this:
cowplot::plot_grid(plotlist = p, ncol = 2)
where the number of columns may need to be changed. See here for other ways to plot a list of plots: How do I arrange a variable list of plots using grid.arrange?
I don't know enough about your problem to understand why you treat the case of j even and j odd differently. But the underlying idea should be the same as what I suggested here.
I eventually got this working as follows.
getHist <- function(x, i){
lam <- paste('$\\lambda_', i, '$', sep='')
p <- qplot(x[i,],
geom="histogram",
fill = I('green'),
color = I('black'),
alpha = I(.5),
main=TeX(paste("Histogram of the pc variance ", lam, sep='')),
xlab=TeX(lam),
ylab="Count",
show.legend=F)
return(p)
}
getDens <- function(x, i){
lam <- paste('$\\lambda_', i, '$', sep='')
p <- qplot(x[i,],
geom="density",
fill = I('blue'),
alpha = I(.5),
main=TeX(paste("Density plot of the pc variance ", lam, sep='')),
xlab=TeX(lam),
ylab="Density",
show.legend=F)
return(p)
}
fp <- lapply(1:3, function(x) arrangeGrob(getHist(lambda.mat, x), getDens(lambda.mat, x), ncol=2))
print(marrangeGrob(fp, nrow = 3, ncol=1, top = textGrob("Lambda.mat Histogram and Density Plot",gp=gpar(fontsize=18))))

Plotting the CDF and Quantile Functions Given the PDF

How would I plot the CDF and Quantile functions, in R, if I have the PDF. Currently, I have the following (but I think there must be a better way to do it):
## Probability Density Function
p <- function(x) {
result <- (x^2)/9
result[x < 0 | x > 3] <- 0
result
}
plot(p, xlim = c(0,3), main="Probability Density Function")
## Cumulative Distribution Function
F <- function(a = 0,b){
result <- ((b^3)/27) - ((a^3)/27)
result[a < 0 ] <- 0
result[b > 3] <- 1
result
}
plot(F(,x), xlim=c(0,3), main="Cumulative Distribution Function")
## Quantile Function
Finv <- function(p) {
3*x^(1/3)
}
As #dash2 suggested, the CDF would need you to integrate the PDF, in essence needing you to find the area under the curve.
Here's a generic solution which should help. I am using a gaussian distribution as an example - you should be able to feed to it any generic function.
Note that quantiles reported are approximations only. Also, dont forget to look into the documentation for integrate().
# CDF Function
CDF <- function(FUNC = p, plot = T, area = 0.5, LOWER = -10, UPPER = 10, SIZE = 1000){
# Create data
x <- seq(LOWER, UPPER, length.out = SIZE)
y <- p(x)
area.vec <- c()
area.vec[1] <- 0
for(i in 2:length(x)){
x.vec <- x[1:i]
y.vec <- y[1:i]
area.vec[i] = integrate(p, lower = x[1], upper = x[i])$value
}
# Quantile
quantile = x[which.min(abs(area.vec - area))]
# Plot if requested
if(plot == TRUE){
# PDF
par(mfrow = c(1, 2))
plot(x, y, type = "l", main = "PDF", col = "indianred", lwd = 2)
grid()
# CDF
plot(x, area.vec, type = "l", main = "CDF", col = "slateblue",
xlab = "X", ylab = "CDF", lwd = 2)
# Quantile
mtext(text = paste("Quantile at ", area, "=",
round(quantile, 3)), side = 3)
grid()
par(mfrow = c(1, 1))
}
}
# Sample data
# PDF Function - Gaussian distribution
p <- function(x, SD = 1, MU = 0){
y <- (1/(SD * sqrt(2*pi)) * exp(-0.5 * ((x - MU)/SD) ^ 2))
return(y)
}
# Call to function
CDF(p, area = 0.5, LOWER = -5, UPPER = 5)

Adding confidence intervals to plot from simulation data in R

I've created a probit simulation based on a likelihood function and simulation, all of which can be replicated with the code below.
This is the likelihood function:
probit.ll <- function(par,ytilde,x) {
a <- par[1]
b <- par[2]
return( -sum( pnorm(ytilde*(a + b*x),log=TRUE) ))
}
This is the function to do the estimates:
my.probit <- function(y,x) {
# use OLS to get start values
par <- lm(y~x)$coefficients
ytilde <- 2*y-1
# Run optim
res <- optim(par,probit.ll,hessian=TRUE,ytilde=ytilde,x=x)
# Return point estimates and SE based on the inverse of Hessian
names(res$par) <- c('a','b')
se=sqrt(diag(solve(res$hessian)))
names(se) <- c('a','b')
return(list(par=res$par,se=se,cov=solve(res$hessian)))
}
And this is the function to generate the simulated model:
probit.data <- function(N=100,a=1,b=1) {
x <- rnorm(N)
y.star <- a + b*x + rnorm(N)
y <- (y.star > 0)
return( as.data.frame(cbind(y,x,y.star)) )
}
This simulates an n size equal 100:
probit.data100 <- function(N=100,a=2,b=1) {
x <- rnorm(N)
y.star <- a + b*x + rnorm(N)
y <- (y.star > 0)
return( as.data.frame(cbind(y,x,y.star)) )
}
#predicted value
se.probit.phat100 <- function(x, par, V) {
z <- par[1] + par[2] * x
# Derivative of q w.r.t. alpha and beta
J <- c( dnorm(z), dnorm(z)*par[2] )
return( sqrt(t(J) %*% V %*% J) )
}
dat100 <- probit.data100()
res100 <- my.probit(dat100$y,dat100$x)
res100
This function below will calculate the confidence intervals based on a non-parametric bootstrap approach (notice the sample function being used):
N <- dim(probit.data(N=100, a=1, b=1))[1]
npb.par <- matrix(NA,100,2)
colnames(npb.par) <- c("alpha","beta")
npb.eystar <- matrix(NA,100,N)
for (t in 1:100) {
thisdta <- probit.data(N=100, a=1, b=1)[sample(1:N,N,replace=TRUE),]
npb.par[t,] <- my.probit(thisdta$y,thisdta$x)$par
}
This function below just cleans up the bootstrap output, and the confidence intervals are what I would like to plot:
processres <- function(simres) {
z <- t(apply(simres,2,function(x) { c(mean(x),median(x),sd(x),quantile(x,c(0.05,0.95))) } ))
rownames(z) <- colnames(simres)
colnames(z) <- c("mean","median","sd","5%","95%")
z
}
processres(npb.par)
I would like to plot a graph like this (the one below), but add confidence intervals based on the processres function above. How can these confidence intervals be added to the plot?
x <- seq(-5,5,length=100)
plot(x, pnorm(1 - 0.5*x), ty='l', lwd=2, bty='n', xlab='x', ylab="Pr(y=1)")
rug(dat100$x)
I'm also open to a different plot code and/or package. I just want a graph based on this simulation with added confidence intervals.
Thanks!
Here's a way to add a shaded CI based on simulation results:
UPDATE: this now plots the expected curve (i.e. using mean alpha & beta values), and correctly passes these means to rnorm.
x <- seq(-5,5,length=100)
plot(x, pnorm(1 - 0.5*x), ty='n', lwd=2, bty='n', xlab='x', ylab="Pr(y=1)",
xaxs = 'i', ylim=c(0, 1))
params <- processres(npb.par)
sims <- 100000
sim.mat <- matrix(NA, ncol=length(x), nrow=sims)
for (i in 1:sims) {
alpha <- rnorm(1, params[1, 1], params[1, 3])
beta <- rnorm(1, params[2, 1], params[2, 3])
sim.mat[i, ] <- pnorm(alpha - beta*x)
}
CI <- apply(sim.mat, 2, function(x) quantile(x, c(0.05, 0.95)))
polygon(c(x, rev(x)), c(CI[1, ], rev(CI[2, ])), col='gray', border=NA)
lines(x, pnorm(params[1, 1] - params[2, 1]*x), lwd=2)
rug(dat100$x)
box()

Resources