double for loop in Binomial Tree in R - r

I want set up a model for interest rate in binomial tree. The interest rate is path dependent. I want return interest rate (discount factor and payoff) at every step in all scenarios(2^N). The reason I want to return every single interest rate is that I want use the interest rate is compute discount factor. I know how to do this in a complex way. Here I want to use a double loop (or something simpler) to get the results.
w is for "0" or "1" dummy variable matrix representing all scenarios.
r is interest rate. if there is a head(1), then r1=r0+u=r0+0.005; if there is a tail(0), then r1=r0-d.
D is discount factor. D1=1/(1+r0), D2=D1/(1+r1)...
P is payoff.
In this case, period N is 10. therefore, I can compute step by step. However,if N gets larger and larger, I cannot use my method. I want a simple way to compute this. Thank you.
#Real Price
N <- 10
r0 <- 0.06
K <- 0.05
u <- 0.005
d <- 0.004
q <- 0.5
w <- expand.grid(rep(list(0:1),N))
r <- D <- P <- matrix(0,0,nrow=2^N,ncol=N)
for(i in 1:dim(w)[1])
{
r[i,1] <- r0 + u*w[i,1] - d*(1-w[i,1])
r[i,2] <- r[i,1] + u*w[i,2] - d*(1-w[i,2])
r[i,3] <- r[i,2]+u*w[i,3]-d*(1-w[i,3])
r[i,4] <- r[i,3]+u*w[i,4]-d*(1-w[i,4])
r[i,5] <- r[i,4]+u*w[i,5]-d*(1-w[i,5])
r[i,6] <- r[i,5]+u*w[i,6]-d*(1-w[i,6])
r[i,7] <- r[i,6]+u*w[i,7]-d*(1-w[i,7])
r[i,8] <- r[i,7]+u*w[i,8]-d*(1-w[i,8])
r[i,9] <- r[i,8]+u*w[i,9]-d*(1-w[i,9])
r[i,10] <- r[i,9]*+u*w[i,10]-d*(1-w[i,10])
D[i,1] <- 1/(1+r0)
D[i,2] <- D[i,1]/(1+r[i,1])
D[i,3] <- D[i,2]/(1+r[i,2])
D[i,4] <- D[i,3]/(1+r[i,3])
D[i,5] <- D[i,4]/(1+r[i,4])
D[i,6] <- D[i,5]/(1+r[i,5])
D[i,7] <- D[i,6]/(1+r[i,6])
D[i,8] <- D[i,7]/(1+r[i,7])
D[i,9] <- D[i,8]/(1+r[i,8])
D[i,10] <- D[i,9]/(1+r[i,9])
P[i,1] <- D[i,1]*pmax(K-r0,0)*(0.5^N)
P[i,2] <- D[i,2]*pmax(K-r[i,1],0)*(0.5^N)
P[i,3] <- D[i,3]*pmax(K-r[i,2],0)*(0.5^N)
P[i,4] <- D[i,4]*pmax(K-r[i,3],0)*(0.5^N)
P[i,5] <- D[i,5]*pmax(K-r[i,4],0)*(0.5^N)
P[i,6] <- D[i,6]*pmax(K-r[i,5],0)*(0.5^N)
P[i,7] <- D[i,7]*pmax(K-r[i,6],0)*(0.5^N)
P[i,8] <- D[i,8]*pmax(K-r[i,7],0)*(0.5^N)
P[i,9] <- D[i,9]*pmax(K-r[i,8],0)*(0.5^N)
P[i,10] <- D[i,10]*pmax(K-r[i,9],0)*(0.5^N)
}
true.price <- sum(P)
#> true.price
# > true.price
# [1] 0.00292045

You can just use a nested loop, looping over 2:(ncol(w)) within the i loop:
for(i in 1:nrow(w)) {
r[i, 1] <- r0 + u*w[i, 1] - d*(1-w[i, 1])
D[i, 1] <- 1/(1+r0)
P[i, 1] <- D[i, 1]*pmax(K-r0, 0)*(0.5^N)
for (j in 2:(ncol(w))) {
r[i,j] <- r[i, j-1] + u*w[i, j] - d*(1-w[i, j])
D[i,j] <- D[i, j-1]/(1+r[i, j-1])
P[i,j] <- D[i, j]*pmax(K-r[i, j-1], 0)*(0.5^N)
}
}
true.price <- sum(P)

Related

Bootstrap t confidence interval for censored observations

I have wrote a simulation code for censored observations to find bootstrap-t confidence interval. However, I encountered some problem where my 'btAlpha' and 'btLambda' cannot compute the correct answer hence I cannot go to the next step which is to calculate the total error probabilities.
This is my code :
#BOOTSTRAP-T (20%)
library(survival)
n <- 100
N <- 1000
alpha <- 1
lambda <- 0.5
alphaHat <- NULL
lambdaHat <- NULL
cp <- NULL
btAlpha <- matrix (NA, nrow=N, ncol=2)
btLambda <- matrix (NA, nrow=N, ncol=2)
for (i in 1:1000) {
u <- runif(n)
c1 <- rexp(n, 0.1)
t1 <- -(log(1 - u^(1/alpha))/lambda)
t <- pmin(t1, c1)
ci <- 1*(t1 < c1) #censored data
cp[i] <- length(ci[ci == 0])/n #censoring proportion
#FUNCTION TO CALL OUT
estBoot < -function(data, j) {
dat <- data [j, ]
data0 <- dat[which(dat$ci == 0), ] # right censored data
data1 <- dat[which(dat$ci == 1), ] # uncensored data
dat
#MAXIMUM LIKELIHOOD ESTIMATION
library(maxLik)
LLF <- function(para) {
alpha <- para[1]
lambda <- para[2]
a <- sum(log(alpha*lambda*(1 - exp(-lambda*data1$t1))^(alpha - 1)*
exp(-lambda*data1$t1)))
b <- sum(log(1 - (1 - exp(-lambda*data0$t1)^(alpha))))
l <- a + b
return(l)
}
mle <- maxLik(LLF, start=c(alpha=1, lambda=0.5))
alphaHat <- mle$estimate[1]
lambdaHat <- mle$estimate[2]
observedDi <- solve(-mle$hessian)
return(c(alphaHat, lambdaHat, observedDi[1, 1], observedDi[2, 2]))
}
library(boot)
bt <- boot(dat, estBoot, R=1000)
bootAlphaHat <- bt$t[, 1] #t is from bootstrap
bootAlphaHat0 <- bt$t0[1] #t0 is from original set
seAlphaHat <- sqrt(bt$t[, 2])
seAlphaHat0 <- sqrt(bt$t0[2]) #same as 'original' in bt
zAlpha <- (bootAlphaHat - bootAlphaHat0)/seAlphaHat
kAlpha <- zAlpha[order(zAlpha)]
ciAlpha <- c(kAlpha[25], kAlpha[975])
btAlpha[i, ] <- rev(bootAlphaHat0 - ciAlpha*seAlphaHat0)
bootLambdaHat <- bt$t[, 2]
bootLambdaHat0 <- bt$t0[2]
seLambdaHat <- sqrt(bt$t[, 4])
seLambdaHat0 <- sqrt(bt$t0[4])
zLambda <- (bootLambdaHat - bootLambdaHat0)/seLambdaHat
kLambda <- zLambda[order(zLambda)]
ciLambda <- c(kLambda[25], kLambda[975])
btLambda[i, ] <- rev(bootLambdaHat0 - ciLambda*seLambdaHat0)
}
leftAlpha <- sum(btAlpha[, 1] > alpha)/N
rightAlpha <- sum(btAlpha[, 2] < alpha)/N
totalEAlpha <- leftAlpha + rightAlpha
leftLambda <- sum(btLambda[, 1] > lambda)/N
rightLambda <- sum(btLambda[, 2] < lambda)/N
totalELambda <- leftLambda + rightLambda
#alpha=0.05
sealphaHat <- sqrt(0.05*(1 - 0.05)/N)
antiAlpha <- totalEAlpha > (0.05 + 2.58*sealphaHat)
conAlpha <- totalEAlpha < (0.05 - 2.58*sealphaHat )
asymAlpha <- (max(leftAlpha, rightAlpha)/min(leftAlpha, rightAlpha)) > 1.5
antiLambda <- totalELambda > (0.05 + 2.58 *sealphaHat)
conLambda <- totalELambda < (0.05 - 2.58 *sealphaHat)
asymLambda <- (max(leftLambda, rightLambda)/min(leftLambda, rightLambda)) > 1.5
anti <- antiAlpha + antiLambda
con <- conAlpha + conLambda
asym <- asymAlpha + asymLambda
My 'btAlpha[i,]' and 'btLambda[i,]' is two matrix data frame and only computed NA values hence I cannot calculate the next step which is total error probabilities etc. It should be simulated 1000 values through specified formula but I didnt get the desired output. I have tried to run this without using loops and same problems encountered. Do you guys have any idea? I could really use and truly appreciate your help.

How do I extract all the values of estquant using R?

I don't know how to extract each value of estquant from this loop. What code should I add at the end that gives me all the values instead of just one!
p <- 0.5
m <- 2
d1 <- as.matrix(d);d1
for (i in 1:m){
Xj <- d1[,i]
nj <- length(Xj)
Fj <- pbeta(Fx,i,nj+1-i)
a <- pbeta(p,i,nj+1-i)
estFj <- knots(ecdf(Xj))
estquant <- min(estFj[estFj >= a])
}
You want estquant to be a vector of length m.
So:
p <- 0.5
m <- 2
d1 <- as.matrix(d);d1
estquant <- numeric(m)
for (i in 1:m){
Xj <- d1[,i]
nj <- length(Xj)
Fj <- pbeta(Fx,i,nj+1-i)
a <- pbeta(p,i,nj+1-i)
estFj <- knots(ecdf(Xj))
estquant[i] <- min(estFj[estFj >= a])
}
estquant
(It's important to predefine an object when assigning values to it 1-by-1 in a loop, otherwise R has to redefine the object for every iteration and that is time-consuming.)

How do I convolve() more than two distributions without doubling the result rowcount every time?

I am attempting to convolve() 36 beta distributions. The result distribution is one of the two input distributions to the next successive call to convolve(). After every convolve(), the result has row count = (nrow(vector1)+nrow(vector2)-1). In effect, the row count of the result distribution almost doubles with every call to convolve(). This is very inefficient - it makes runtime impossibly long and consumes large amounts of memory. Is there any way to keep the row count constant?
Code example below ...
# Function from https://stat.ethz.ch/pipermail/r-help/2008-July/168762.html
weighted.var <- function(x, w, na.rm = FALSE) {
if (na.rm) {
w <- w[i <- !is.na(x)]
x <- x[i]
}
sum.w <- sum(w)
sum.w2 <- sum(w^2)
mean.w <- sum(x * w) / sum(w)
(sum.w / (sum.w^2 - sum.w2)) * sum(w * (x - mean.w)^2, na.rm = na.rm);
}
# Define beta distribution shape parameters.
s1a <- 3.52; s1b <- 65.35;
s2a <- 1.684; s2b <- 189.12;
s3a <- 5.696; s3b <- 32.34;
s4a <- 1.81; s4b <- 185.5;
# Define intial set of quantiles.
mQ1 <- matrix(data=seq(0,1,1/1000),ncol=1);
for (i in 1:3){
mPDF <- matrix(data=convolve(dbeta(mQ1,s1a,s1b),rev(dbeta(mQ1,s2a,s2b)),type="open"),ncol=1L);
print(paste(nrow(mPDF),' rows',sep=''));
if(i < 3){
# Calculate the merged shape parameters directly from mPDF.
mQ2 <- matrix(data=seq(0,1L,(1L/(nrow(mPDF)-1L))),ncol=1L);
wtMean <- weighted.mean(mQ2,mPDF);
wtStd <- sqrt(weighted.var(mQ2,mPDF));
s1a <- -1L * ((wtMean*(wtStd^2 + wtMean^2 - wtMean))/wtStd^2);
s1b <- ((wtStd^2 + wtMean^2 - wtMean)*(wtMean - 1))/wtStd^2;
s2a <- s3a; s2b <- s3b;
mQ1 <- mQ2;
}
} #i

Speeding up linear model fitting on complete pairwise observations in large sparse matrix in R

I have a numeric data.frame df with 134946 rows x 1938 columns.
99.82% of the data are NA.
For each pair of (distinct) columns "P1" and "P2", I need to find which rows have non-NA values for both and then do some operations on those rows (linear model).
I wrote a script that does this, but it seems quite slow.
This post seems to discuss a related task, but I can't immediately see if or how it can be adapted to my case.
Borrowing the example from that post:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow=nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
My script is:
tic = proc.time()
out <- do.call(rbind,sapply(1:(N_ps-1), function(i) {
if (i/10 == floor(i/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
do.call(rbind,sapply((i+1):N_ps, function(j) {
w <- which(complete.cases(df[,i],df[,j]))
N <- length(w)
if (N >= 5) {
xw <- df[w,i]
yw <- df[w,j]
if ((diff(range(xw)) != 0) & (diff(range(yw)) != 0)) {
s <- summary(lm(yw~xw))
o <- c(i,j,N,s$adj.r.squared,s$coefficients[2],s$coefficients[4],s$coefficients[8],s$coefficients[1],s$coefficients[3],s$coefficients[7])} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
},simplify=F))
}
,simplify=F))
toc = proc.time();
show(toc-tic);
This takes about 10 minutes on my machine.
You can imagine what happens when I need to handle a much larger (although more sparse) data matrix. I never managed to finish the calculation.
Question: do you think this could be done more efficiently?
The thing is I don't know which operations take more time (subsetting of df, in which case I would remove duplications of that? appending matrix data, in which case I would create a flat vector and then convert it to matrix at the end? ...).
Thanks!
EDIT following up from minem's post
As shown by minem, the speed of this calculation strongly depended on the way linear regression parameters were calculated. Therefore changing that part was the single most important thing to do.
My own further trials showed that: 1) it was essential to use sapply in combination with do.call(rbind, rather than any flat vector, to store the data (I am still not sure why - I might make a separate post about this); 2) on the original matrix I am working on, much more sparse and with a much larger nrows/ncolumns ratio than the one in this example, using the information on the x vector available at the start of each i iteration to reduce the y vector at the start of each j iteration increased the speed by several orders of magnitude, even compared with minem's original script, which was already much better than mine above.
I suppose the advantage comes from filtering out many rows a priori, thus avoiding costly xna & yna operations on very long vectors.
The modified script is the following:
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.90)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x))
dl <- as.list(df)
rl <- sapply(1:(N_ps - 1), function(i) {
if ((i-1)/10 == floor((i-1)/10)) {
cat("\ni = ",i,"\n")
toc = proc.time();
show(toc-tic);
}
x <- dl[[i]]
xna <- which(naIds[[i]])
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]][xna]
yna <- which(naIds[[j]][xna])
w <- xna[yna]
N <- length(w)
if (N >= 5) {
xw <- x[w]
yw <- y[yna]
if ((min(xw) != max(xw)) && (min(yw) != max(yw))) {
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,7))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic)
E.g. try with nr=100000; nc=100.
I should probably mention that I tried using indices, i.e.:
naIds <- lapply(df, function(x) which(!is.na(x)))
and then obviously generating w by intersection:
w <- intersect(xna,yna)
N <- length(w)
This however is slower than the above.
Larges bottleneck is lm function, because there are lot of checks & additional calculations, that you do not necessarily need. So I extracted only the needed parts.
I got this to run in +/- 18 seconds.
set.seed(54321)
nr = 1000;
nc = 900;
dat = matrix(runif(nr*nc), nrow = nr)
rownames(dat) = paste(1:nr)
colnames(dat) = paste("time", 1:nc)
dat[sample(nr*nc, nr*nc*0.9)] = NA
df <- as.data.frame(dat)
df_ps <- names(df)
N_ps <- length(df_ps)
tic = proc.time()
naIds <- lapply(df, function(x) !is.na(x)) # outside loop
dl <- as.list(df) # sub-setting list elements is faster that columns
rl <- sapply(1:(N_ps - 1), function(i) {
x <- dl[[i]]
xna <- naIds[[i]] # relevant logical vector if not empty elements
rl2 <- sapply((i + 1):N_ps, function(j) {
y <- dl[[j]]
yna <- naIds[[j]]
w <- xna & yna
N <- sum(w)
if (N >= 5) {
xw <- x[w]
yw <- y[w]
if ((min(xw) != max(xw)) && (min(xw) != max(xw))) { # faster
# extracts from lm/lm.fit/summary.lm functions
X <- cbind(1L, xw)
m <- .lm.fit(X, yw)
# calculate adj.r.squared
fitted <- yw - m$residuals
rss <- sum(m$residuals^2)
mss <- sum((fitted - mean(fitted))^2)
n <- length(m$residuals)
rdf <- n - m$rank
# rdf <- df.residual
r.squared <- mss/(mss + rss)
adj.r.squared <- 1 - (1 - r.squared) * ((n - 1L)/rdf)
# calculate se & pvals
p1 <- 1L:m$rank
Qr <- m$qr
R <- chol2inv(Qr[p1, p1, drop = FALSE])
resvar <- rss/rdf
se <- sqrt(diag(R) * resvar)
est <- m$coefficients[m$pivot[p1]]
tval <- est/se
pvals <- 2 * pt(abs(tval), rdf, lower.tail = FALSE)
res <- c(m$coefficients[2], se[2], pvals[2],
m$coefficients[1], se[1], pvals[1])
o <- c(i, j, N, adj.r.squared, res)
} else {
o <- c(i,j,N,rep(NA,6))
}
} else {o <- NULL}
return(o)
}, simplify = F)
do.call(rbind, rl2)
}, simplify = F)
out2 <- do.call(rbind, rl)
toc = proc.time();
show(toc - tic);
# user system elapsed
# 17.94 0.11 18.44

for loop storage issue (nested for loops)

I am having difficulty storing all of my data from my middle for loop. when i try and retrieve the data after the outside for loop has run the only data that i am able to attain is the final run. how do i store all of the runs of D in a single matrix?
set.seed(3690)
iterations <- 20
mean_birthrate <- 0.4
stand_dev_birthrate <- 0.1
mean_survival_rate <- 0.68
stand_dev_survival <- 0.07
initial_population <- 100
period <- 20
End_Year <- 2013+period
birthrate <- rnorm(n=1,mean=mean_birthrate,sd=stand_dev_birthrate)
birthrate
survival <- rnorm(n=1,mean=mean_survival_rate,sd=stand_dev_survival)
survival
growth_rate <- birthrate - (1-survival)
growth_rate
for (k in 1:50) {
D <- numeric(period)
D[1] <- initial_population
for (i in 1:period) {
D[i+1] <- D[i] + ((rnorm(n=1,mean=mean_birthrate,sd=stand_dev_birthrate) - (1-rnorm(n=1,mean=mean_survival_rate,sd=stand_dev_survival))) * D[i])
}
print(D)
if (k==1)
plot(D, typ="l",ylab="Number of Bobcats",xlab="Year",ylim=c(50,1700),xaxt='n')
if (k>1)
lines(D,col = rgb(0,0,0,0.1),xaxt='n')
}
if you have nested for loops, you need nested lists (or some other form to capture the n x m many results)
#outter loop
for (i in...)
D[[i]] <- list()
#inner loop
for (j in ...)
D[[i]][[j]] <- value
# or different syntax:
D[[c(i, j)]] <- calue
I had a similar problem, and solved it with a pre-defined variable, like this:
DATA <- list()
j <- 0
for(k in 1:10){
for(i in 1:5){
temp.DATA <- i*k #or whatever the loop does
j <- j+1
DATA[[j]] <- temp.DATA
}
}
DATA2 <- do.call(rbind.data.frame, DATA)
Hope that helps!

Resources