Ratio-of-Uniforms Distribution in R - r

I have an exercise, in which i have to create an algorithm as follows:
ratio of Uniforms is based on the fact that for a random variable X with density f(x) we can generate X from the desired density by calculating X = U/V for a pair (U, V ) uniformly distributed in the set
Af = {(u,v):0 < v ≤ f(u/v)}
Random points can be sampled uniformly in Af by rejection from the min- imal bounding rectangle, i.e., the smallest possible rectangle that contains Af .
It is given by (u−, u+) × (0, v+) where
v+ = max f(x), x
u− = minx f(x), x
u+ = maxx f(x)
Then the Ratio-of-Uniforms method consists of the following simple steps:
Generate random number U uniformly in (u−, u+).
Generate random number V uniformly in (0, v+).
Set X ← U/V .
If V 2 ≤ f(X) accept and return X.
Else try again.
My code so far:
x <- cnorm(1, mean = 0, sd=1)
myrnorm <- function(pdf){
## call rou() n times
pdf <- function(x) {exp(-x^2/2)}
}
rou <- function(u, v) {
uplus <- 1
vplus <- 1
n <- 100
u <- runif(n, min=0, max=uplus)
v <- runif(n, min=0, max=vplus)
xi <- v/u
while(v < sqrt(xi)) {
if(v^2 <= xi)
return(xi)
}
}
myx <- myrnorm(1000)
hist(myx)
But I really dont know how to go on. Im ´lost with this exercise. I would be really grateful for any advise.

Following example 1 in page 8 of this link and your sample code, I came up this solution:
ratioU <- function(nvals)
{
h_x = function(x) exp(-x)
# u- is b-, u+ is b+ and v+ is a in the example:
uminus = 0
uplus = 2/exp(1)
vplus = 1
X.vals <- NULL
i <- 0
repeat {
i <- i+1
u <- runif(1,0,vplus)
v <- runif(1,uminus,uplus)
X <- u/v
if(v^2 <= h_x(X)) {
tmp <- X
}
else {
next
}
X.vals <- c(X.vals,tmp)
if(length(X.vals) >= nvals) break
}
answer <- X.vals
answer
}
sol = ratioU(1000)
par(mfrow=c(1,2))
hist(sol,breaks=50, main= "using ratioU",freq=F)
hist(rexp(1000),breaks = 50, main="using rexp from R",freq=F)
par(mfrow=c(1,1))
par(mfrow=c(1,2))
plot(density(sol))
plot(density(rexp(1000)))
par(mfrow=c(1,1))
A lot of the code may be optimized but I think it is good enough like this for this purpose. I hope this helps.

Related

Area Under the Curve using Simpson's rule in R

I would like to compute the Area Under the Curve defined by a set of experimental values. I created a function to calculate an aproximation of the AUC using the Simpson's rule as I saw in this post. However, the function only works when it receives a vector of odd length. How can I modify the code to add the area of the last trapezoid when the input vector has an even length.
AUC <- function(x, h=1){
# AUC function computes the Area Under the Curve of a time serie using
# the Simpson's Rule (numerical method).
# https://link.springer.com/chapter/10.1007/978-1-4612-4974-0_26
# Arguments
# x: (vector) time serie values
# h: (int) temporal resolution of the time serie. default h=1
n = length(x)-1
xValues = seq(from=1, to=n, by=2)
sum <- list()
for(i in 1:length(xValues)){
n_sub <- xValues[[i]]-1
n <- xValues[[i]]
n_add <- xValues[[i]]+1
v1 <- x[[n_sub+1]]
v2 <- x[[n+1]]
v3 <- x[[n_add+1]]
s <- (h/3)*(v1+4*v2+v3)
sum <- append(sum, s)
}
sum <- unlist(sum)
auc <- sum(sum)
return(auc)
}
Here a data example:
smoothed = c(0.3,0.317,0.379,0.452,0.519,0.573,0.61,0.629,0.628,0.613,0.587,0.556,0.521,
0.485,0.448,0.411,0.363,0.317,0.273,0.227,0.185,0.148,0.12,0.103,0.093,0.086,
0.082,0.079,0.076,0.071,0.066,0.059,0.053,0.051,0.052,0.057,0.067,0.081,0.103,
0.129,0.165,0.209,0.252,0.292,0.328,0.363,0.398,0.431,0.459,0.479,0.491,0.494,
0.488,0.475,0.457,0.43,0.397,0.357,0.316,0.285,0.254,0.227,0.206,0.189,0.181,
0.171,0.157,0.151,0.162,0.192,0.239)
One recommended way to handle an even number of points and still achieve precision is to combine Simpson's 1/3 rule with Simpson's 3/8 rule, which can handle an even number of points. Such approaches can be found in (at least one or perhaps more) engineering textbooks on numerical methods.
However, as a practical matter, you can write a code chunk to check the data length and add a single trapezoid at the end, as was suggested in the last comment of the post to which you linked. I wouldn't assume that it is necessarily as precise as combining Simpson's 1/3 and 3/8 rules, but it is probably reasonable for many applications.
I would double-check my code edits below, but this is the basic idea.
AUC <- function(x, h=1){
# AUC function computes the Area Under the Curve of a time serie using
# the Simpson's Rule (numerical method).
# https://link.springer.com/chapter/10.1007/978-1-4612-4974-0_26
# Arguments
# x: (vector) time serie values
# h: (int) temporal resolution of the time serie. default h=1
#jh edit: check for even data length
#and chop off last data point if even
nn = length(x)
if(length(x) %% 2 == 0){
xlast = x[length(x)]
x = x[-length(x)]
}
n = length(x)-1
xValues = seq(from=1, to=n, by=2)
sum <- list()
for(i in 1:length(xValues)){
n_sub <- xValues[[i]]-1
n <- xValues[[i]]
n_add <- xValues[[i]]+1
v1 <- x[[n_sub+1]]
v2 <- x[[n+1]]
v3 <- x[[n_add+1]]
s <- (h/3)*(v1+4*v2+v3)
sum <- append(sum, s)
}
sum <- unlist(sum)
auc <- sum(sum)
##jh edit: add trapezoid for last two data points to result
if(nn %% 2 == 0){
auc <- auc + (x[length(x)] + xlast)/2 * h
}
return(auc)
}
sm = smoothed[-length(smoothed)]
length(sm)
[1] 70
#even data as an example
AUC(sm)
[1] 20.17633
#original odd data
AUC(smoothed)
[1] 20.389
There may be a good reason for you to prefer using Simpson's rule, but if you're just looking for a quick and efficient estimate of AUC, the trapezoid rule is far easier to implement, and does not require an even number of breaks:
AUC <- function(x, h = 1) sum((x[-1] + x[-length(x)]) / 2 * h)
AUC(smoothed)
#> [1] 20.3945
Here, I show example code that uses the Simpson's 1/3 and 3/8 rules in tandem for the numerical integration of data. As always, the usual caveats about the possibility of coding errors or compatibility issues apply.
The output at the end compares the numerical estimates of this algorithm with the trapezoidal rule using R's "integrate" function.
#Algorithm adapted from:
#Numerical Methods for Engineers, Seventh Edition,
#By Chapra and Canale, page 623
#Modified to accept data instead of functional values
#Modified by: Jeffrey Harkness, M.S.
##Begin Simpson's rule function code
simp13 <- function(dat, h = 1){
ans = 2*h*(dat[1] + 4*dat[2] + dat[3])/6
return(ans)}
simp13m <- function(dat, h = 1){
summ <- dat[1]
n <- length(dat)
nseq <- seq(2,(n-2),2)
for(i in nseq){
summ <- summ + 4*dat[i] + 2*dat[i+1]}
summ <- summ + 4*dat[n-1] + dat[n]
result <- (h*summ)/3
return(result)}
simp38 <- function(dat, h = 1){
ans <- 3*h*(dat[1] + 3*sum(dat[2:3]) + dat[4])/8
return(ans)}
simpson = function(dat, h = 1){
hin = h
len = length(dat)
comp <- len %% 2
##number of segments
if(len == 2){
ans = sum(dat)/2*h} ##n = 2 is the trapezoidal rule
if(len == 3){
ans = simp13(dat, h = hin)}
if(len == 4){
ans = simp38(dat,h = hin)}
if(len == 6){
ans <- simp38(dat[1:4],h = hin) + simp13(dat[4:len],h = hin)}
if(len > 6 & comp == 0){
ans = simp38(dat[1:4],h = hin) + simp13m(dat[4:len],h = hin)}
if(len >= 5 & comp == 1){
ans = simp13m(dat,h = hin)}
return(ans)}
##End Simpson's rule function code
This next section of code shows the performance comparison. This code can easily be altered for different test functions and cases.
The precision difference tends to change with the sample size and test function used; this example is not intended to imply that the difference is always this pronounced.
#other algorithm for comparison purposes, from Allan Cameron above
oa <- function(x, h = 1) sum((x[-1] + x[-length(x)]) / 2 * h)
#Testing and algorithm comparison code
simans = NULL; oaans = NULL; simerr = NULL; oaerr = NULL; mp = NULL
for( j in 1:10){
n = j
#f = function(x) cos(x) + 2 ##Test functions
f = function(x) 0.2 + 25*x - 200*x^2 + 675*x^3 - 900*x^4 + 400*x^5
a = 0;b = 10
h = (b-a)/n
datain = seq(a,b,by = h)
preans = integrate(f,a,b)$value #precise numerical estimate of test function
simans[j] = simpson(f(datain), h = h)
oaans[j] = oa(f(datain), h = h)
(simerr[j] = abs(simans[j] - preans)/preans * 100)
(oaerr[j] = abs(oaans[j] - preans)/preans * 100)
mp[j] = simerr[j] < oaerr[j]
}
(outframe = data.frame("simpsons percent diff" = simerr,"trapezoidal percent diff" = oaerr, "more precise?" = mp, check.names = F))
simpsons percent diff trapezoidal percent diff more precise?
1 214.73489738 214.734897 FALSE
2 15.07958148 64.993410 TRUE
3 6.70203621 29.816799 TRUE
4 0.94247384 16.955208 TRUE
5 0.54830021 10.905620 TRUE
6 0.18616767 7.593825 TRUE
7 0.12051767 5.588209 TRUE
8 0.05890462 4.282980 TRUE
9 0.04087107 3.386525 TRUE
10 0.02412733 2.744500 TRUE

How to fix error in R code because of NaN values?

I have an assignment to estimate parameter $θ$ from a sample with Pareto distribution with density $f(x; θ) = θ/x^(θ + 1), x ≥ 1$, where $θ>0$ is an unknown parameter. However, we do not know the realized sample $x$, we only know for each $x_i$ the given interval $(u_i; v_i)$ in which it is located.
Using the EM algorithm we have to estimate parameter $θ$. Also, EM has to be implemented in R and the code for that is down below.
When I run the code, I have an error because of NaN values. I've tried changing the starting value of the parameter, but NaN values still appear. How to fix this?
set.seed(1)
library(VGAM)
library(ggplot2)
#--------------------------------------------------------------------
# EM algorithm
# step E
expected_xs <- function(teta, u, v) {
teta/(teta-1) *
1/(1/(u^teta)-1/(v^teta))
}
# step M
maximize_logL <- function(xs) {
length(xs)/
sum(log(xs))
}
EM_estimate <- function(teta_0, u, v, tol = 1e-8, maxiter = 1000) {
xs <- expected_xs(teta_0, u, v)
teta <- maximize_logL(xs)
print(teta)
iter <- 1
while(!is.na(teta) && (abs(teta - teta_0) > tol) &&
iter < maxiter) {
iter <- iter + 1
teta_0 <- teta
xs <- expected_xs(teta_0, u, v)
teta <- maximize_logL(xs)
print(teta)
}
return(teta)
}
#--------------------------------------------------------------------
# Data
df <- read.table(header=T, text="
interval freq
1 1-1.5 15
2 1.5-2 5
3 2-2.5 3
4 2.5-3 3
5 3-1000 4")
df
#u <- c(1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5)
#v <- c(1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9,9.5,10)
u <- seq(1, 999.5, by=0.5)
v <- seq(1.5, 1000, by=0.5)
teta1=EM_estimate(0.3, u, v)
teta1
# we compare barplot with density (with its now estimated parameter)
barplot(df$freq, names.arg = df$interval)
curve(100*dpareto(x,teta1,1), add=TRUE, col="steelblue", lwd = 2)
One more thing, when I change teta/(teta-1)to teta/(teta+1) in here:
expected_xs <- function(teta, u, v) {
teta/(teta-1) *
1/(1/(u^teta)-1/(v^teta))
}
everything works normally.

Gauss Newton method R

Find the MLE of the non-linear distribution (in R, using a Gauss-Newton method):
y = sin(x*theta) + epsilon
where epsilon ~ N(0 , 0.01^2)
To do this, I've been asked to generate some data that is uniformly (and randomly) distributed from 0 <= x <= 10 , with n = 200 and theta = 2 (just for generation).
For instance, values that are close to the maximum of the sin function (1, 4 etc.) will converge but others won't.
EDITED
I now understand what theta.iter means but I cannot seem to understand why it converges only sometimes and even then, which values to input to get a useful output of. Can someone explain?
theta <- 2
x <- runif(200, 0, 10)
x <- sort(x) #this is just to sort the generated data so that plotting it
#actually looks like a sine funciton
y <- sin(x*theta) + rnorm(200, mean = 0, sd = 0.1^2)
GN_sin <- function(theta.iter, x , y, epsilon){
index <- TRUE
while (index){
y.iter <- matrix(y - sin(x*theta.iter), 200, 1)
x.iter <- matrix(theta.iter*cos(x*theta.iter), 200, 1)
theta.new <- theta.iter +
solve(t(x.iter)%*%x.iter)%*%t(x.iter)%*%y.iter
if (abs(theta.new-theta.iter) < epsilon) {index <- FALSE}
theta.iter <- as.vector(theta.new)
cat(theta.iter, '\n')
}
}

How to generate a probability density function and expectation in r?

The task:
Eric the fly has a friend, Ernie. Assume that the two flies sit at independent locations, uniformly distributed on the globe’s surface. Let D denote the Euclidean distance between Eric and Ernie (i.e., on a straight line through the interior of the globe).
Make a conjecture about the probability density function of D and give an
estimate of its expected value, E(D).
So far I have made a function to generate two points on the globe's surface, but I am unsure what to do next:
sample3d <- function(2)
{
df <- data.frame()
while(n > 0){
x <- runif(1,-1,1)
y <- runif(1,-1,1)
z <- runif(1,-1,1)
r <- x^2 + y^2 + z^2
if (r < 1){
u <- sqrt(x^2+y^2+z^2)
vector = data.frame(x = x/u,y = y/u, z = z/u)
df <- rbind(vector,df)
n = n- 1
}
}
df
}
E <- sample3d(2)
This is an interesting problem. I'll outline a computational approach; I'll leave the math up to you.
First we fix a random seed for reproducibility.
set.seed(2018);
We sample 10^4 points from the unit sphere surface.
sample3d <- function(n = 100) {
df <- data.frame();
while(n > 0) {
x <- runif(1,-1,1)
y <- runif(1,-1,1)
z <- runif(1,-1,1)
r <- x^2 + y^2 + z^2
if (r < 1) {
u <- sqrt(x^2 + y^2 + z^2)
vector = data.frame(x = x/u,y = y/u, z = z/u)
df <- rbind(vector,df)
n = n- 1
}
}
df
}
df <- sample3d(10^4);
Note that sample3d is not very efficient, but that's a different issue.
We now randomly sample 2 points from df, calculate the Euclidean distance between those two points (using dist), and repeat this procedure N = 10^4 times.
# Sample 2 points randomly from df, repeat N times
N <- 10^4;
dist <- replicate(N, dist(df[sample(1:nrow(df), 2), ]));
As pointed out by #JosephWood, the number N = 10^4 is somewhat arbitrary. We are using a bootstrap to derive the empirical distribution. For N -> infinity one can show that the empirical bootstrap distribution is the same as the (unknown) population distribution (Bootstrap theorem). The error term between empirical and population distribution is of the order 1/sqrt(N), so N = 10^4 should lead to an error around 1%.
We can plot the resulting probability distribution as a histogram:
# Let's plot the distribution
ggplot(data.frame(x = dist), aes(x)) + geom_histogram(bins = 50);
Finally, we can get empirical estimates for the mean and median.
# Mean
mean(dist);
#[1] 1.333021
# Median
median(dist);
#[1] 1.41602
These values are close to the theoretical values:
mean.th = 4/3
median.th = sqrt(2)

fitting a 5th order Bézier Curve to a data set

This is a slightly specific problem, so a bit of knowledge of R and of Bézier curves is required to be of help... (thanks if you do!!)
So I need some help with my R code: I have a series of discretely sampled observations and I am trying to fit a Bézier Curve of the 5th order through these points with simple LSS regression. I have some limitations on the position of the 6 control points:
A & B have the same Y-axis coordinate
B & C have the same X-axis coordinate
C & D have the same Y-axis coordinate
D & E have the same X-axis coordinate
E & F have the same Y-axis coordinate
A is located on the observation 2 turning points ago from the last
observation
The X-axis coordinate of the last observation is
somewhere between the X-axis coordinates of E and F
Like this image:
Say I have these data:
-0.01105
-0.01118
-0.01271
-0.01479
-0.01729
-0.01996
-0.02250
-0.02473
-0.02554
-0.02478
-0.02207
-0.01788
-0.01319
-0.00956
They have a "curvy" shape so a Bézier curve would fit: the result of my code is this image: the data are in red, the 5th order Bézier and its control points with their restrictions in blue:
Like this image:
So you see that I have some kind of solution, but this is the problem:
The X-axis location of right-most control point is always to the right of the last input data point, and to get an appropriate fit, I had to require a value of t (t goes from 0 to 1 in a Bézier) where t is at if the input data end (the "limit" variable in my code). How do I rewrite it so I don't have to do that anymore, and the horizontal spread of the t-values remains constant, also outside of the input data?
(given the restrictions on the control points, and maximizing the fit of the part of the curve that overlaps with the input data)
If you can help, please take a look at this R code, any help is .. much much appreciated and happy holidays!!
ps: what I call exampledata.csv in my code is just the data above.
getT <- function(x){
# Calculates length from origin of each point in the path.
# args:
# x : a one dimensional vector
# Returns:
# out : a vector of distances from the origin, as a percent of end point - start point distance
out <- cumsum(abs(diff(x)))
out <- c(0, out/ out[length(out)])
return(out)
}
cost_f <- function(X,Y,K){
pred <-K%*%X
c <- Y- pred
out <- list(loss= as.vector(t(c)%*%c), pred = pred)
return(out)
}
df <- read.csv('exampledata.csv')
T <- nrow(df)
df['d'] = 1:T
# # identify all turning points:
# turn_point <- c(1)
# for(i in 2:(T-1)){
# if( ( (df[i,'x'] < df[i-1,'x']) & (df[i,'x'] < df[i+1,'x'])) | ( (df[i,'x'] > df[i-1,'x']) & (df[i,'x'] > df[i+1,'x'])) ){
# turn_point <- c(turn_point, i)
# }
# }
fit_last_piece <- function(df){
limit <- .79
turn_point <- c(1)
for(i in 2:(T-1)){
if( ( (df[i,'x'] < df[i-1,'x']) & (df[i,'x'] < df[i+1,'x'])) | ( (df[i,'x'] > df[i-1,'x']) & (df[i,'x'] > df[i+1,'x'])) ){
turn_point <- c(turn_point, i)
}
}
nk <- length(turn_point) # number of turning points
data <- df[turn_point[nk-1]:nrow(df),]
end_x <- data$d[1]
end_y <- data$x[1]
constr_x <- matrix(c(1,0,0,0,0,0, # remember data is input column to column
0,1,1,0,0,0,
0,0,0,1,1,0,
0,0,0,0,0,1),nrow = 6, ncol = 4)
constr_y <- matrix(c(1,1,0,0,0,0,
0,0,1,1,0,0,
0,0,0,0,1,1),nrow = 6, ncol = 3)
M = matrix(c(-1,5,-10,10,-5,1,
5,-20,30,-20,5,0,
-10,30,-30,10,0,0,
10,-20,10,0,0,0,
-5,5,0,0,0,0,
1,0,0,0,0,0),nrow = 6, ncol = 6)
t_x = getT(data$d)*limit
T_x = cbind(t_x^5, t_x^4 ,t_x^3, t_x^2, t_x,rep(1,length(t_x)))
in_par <- ( tail(data$d,1)-data$d[1])*c(2/5,4/5,6/5) + data$d[1] # initial values of the intermediate x levels are at 1/3 and 2/3 midpoints
res_x <- optim(par = in_par, fn = function(par){cost_f(c(data$d[1], par[1],par[2], par[3]), data$d, T_x%*%M%*%constr_x)$loss})
#res_x <- optimize(f = function(par){cost_f(c(df$d[1],par,df$d[nrow(df)]), df$d, T_x%*%M%*%constr_x)$loss}, interval = c(df$d[1],df$d[nrow(df)]),tol = .Machine$double.eps^0.25)
optim_x <- c(data$d[1],res_x$par)
pred_x <- cost_f(optim_x, data$d, T_x%*%M%*%constr_x)$pred
t_y = getT(data$x)*limit
T_y = cbind(t_y^5, t_y^4,t_y^3, t_y^2, t_y,rep(1,length(t_y)))
in_par <- c()
res_y <- optim(par = c(data$x[floor(nrow(data)/2)],tail(data$x,1)), fn = function(par){cost_f(c(data$x[1],par[1],par[2]), data$x, T_y%*%M%*%constr_y)$loss})
optim_y <- c(data$x[1],res_y$par[1],res_y$par[2])
#pred_y <- cost_f(res_y$par, df$x, T_y%*%M%*%constr_y)$pred
pred_y <- cost_f(optim_y, data$x, T_y%*%M%*%constr_y)$pred
t_x_p <- c(t_x,seq(tail(t_x,1),1,length.out = 10))
T_x_p <- cbind(t_x_p^5, t_x_p^4 ,t_x_p^3, t_x_p^2, t_x_p,rep(1,length(t_x_p)))
t_y_p <- c(t_y,seq(tail(t_y,1),1,length.out = 10))
T_y_p <- cbind(t_y_p^5, t_y_p^4 ,t_y_p^3, t_y_p^2, t_y_p,rep(1,length(t_y_p)))
pred_x <- T_x_p%*%M%*%constr_x%*%optim_x
pred_y <- T_y_p%*%M%*%constr_y%*%optim_y
# this part is new:
plot(pred_x,pred_y, ylim = c(min(c(data$x, pred_y,res_y$par)), max(c(data$x, pred_y,res_y$par))),col="blue",type="b")
points(data$d,data$x,col = 'red',type="b")
points(pred_x[1],pred_y[1],pch=20,col='blue')
points(res_x$par[1],pred_y[1],pch=20,col='blue')
points(res_x$par[1],res_y$par[1],pch=20,col='blue')
points(res_x$par[2],res_y$par[1],pch=20,col='blue')
points(res_x$par[2],res_y$par[2],pch=20,col='blue')
points(res_x$par[3],res_y$par[2],pch=20,col='blue')
segments(pred_x[1],pred_y[1],res_x$par[1],pred_y[1],lty=3,col='blue')
segments(res_x$par[1],pred_y[1],res_x$par[1],res_y$par[1],lty=3,col='blue')
segments(res_x$par[1],res_y$par[1],res_x$par[2],res_y$par[1],lty=3,col='blue')
segments(res_x$par[2],res_y$par[1],res_x$par[2],res_y$par[2],lty=3,col='blue')
segments(res_x$par[2],res_y$par[2],res_x$par[3],res_y$par[2],lty=3,col='blue')
}
fit_last_piece(df)

Resources