Extracting row and column in R - r

I have to write a one sample proportion Z test function in R. I need to have the sample proportion be the proportion of data in the first factor level.
For example,
data <- factor(c(NA, rep("a", 60), rep("b", 40)))
table(data)
a b
60 40
And I need the sample proportion to be 60/100. Here is portion of my code and it is returning an error saying unexpected symbol in mtab <- addmargins(table(data)).
hyp_test <- function(data, hyp_val=NULL, alpha, alternative="two-sided",graph=FALSE) {
n <- sum(!is.na(data))
ifelse(is.factor(data),
mtab <- addmargins(table(data))
phat <- mtab[1]/mtab[3]
qhat <- 1 - phat
if(length(hyp_val) > 0) {
q <- 1-hyp_val
SE.phat <- sqrt((hyp_val*q)/n)
ts.z <- (phat - hyp_val)/SE.phat
p.val <- pnorm(ts.z)*2
if(alternative=="less") {
p.val <- pnorm(ts.z)
}
if(alternative=="greater") {
p.val <- 1 - p.val
}
}
Any help would be much appreciated. I need to basically find out how to find the sample proportion.

In addition to what r2evans states, you should review if statements and pnorm. This is a guesstimate of what you are trying to accomplish since the code is cut off.
hyp_test <- function(data, hyp_val=NULL, alpha, alternative="two-sided",graph=FALSE) {
n <- sum(!is.na(data))
mtab <- addmargins(table(data))
phat <- mtab[1]/mtab[3]
qhat <- 1 - phat
q <- 1-hyp_val
SE.phat <- sqrt((hyp_val*q)/n)
ts.z <- (phat - hyp_val)/SE.phat
p.val <- ifelse(alternative=="two-sided", dnorm(ts.z)*2,ifelse(alternative=="less",1-dnorm(ts.z), dnorm(ts.z)))
if(graph==TRUE) {plot(...)}
return(p.val)
}

Related

How to perform a bootstrap and find confidence interval in R

I want to create a custom bootstrap function because I want to better understand what bootstrap is doing and it seems like the other bootstrap libraries out there does not solve my issue.
The Problem: I would like to create my own wald confidence interval function where it takes in the bootstrap data, outputs the confidence interval, test the confidence interval is within a range, and gets the coverage.
Right now, I am getting this type of error:
Error in bootresults[i,}<-waldCI(y=bootdata[i], n=numTrials):number of
items to replace is not a multiple of replacement length
The goal: My goal is to get the bootresults dataset to return 4 columns(p value,One that shows the upper bound, lower bound, and whether or not the p is in the interval) and get a graph similar to this one:
Wald interval chart
Code:
set.seed(42)
samples10 <- list()
i <- 1
while(i < 100) {
sample10[[i]] <- rbinom(1500, size=10, prob=i*.01) ## rows=1500 ;columns=10
i <- i + 1
}
sample10 <- data.frame(samples10)
colnames(sample10) <- c(seq(.01, .99, .01)) ## p-values
waldconfidenceinterval <- function(y, n, alpha=0.05) {
p <- colSums(y)/(n*200)
sd <- sqrt(p*((1 - p)/(n*200)))
z <- qnorm(c(alpha/2, 1 - alpha/2))
ci <- p + z*sd
return(ci)
}
B <- 200
numTrials <- 10
bootresults <- matrix(ncol=length(sample10), nrow=B) ## rows=200, cols=99
## empty matrix in the beginning
set.seed(42)
for(i in seq_len(B)) {
bootdata <- sample10[sample(B, replace=T), ]
bootresults[i, ] <- waldCI(y=bootdata[i], n=numTrials)
## Pseudocode:
# boot_test_data$in_interval <-
# ifelse(boot_test_data$lower1 < i/100 & i/100 < boot_test_data$upper1, 1, 0)
# coverage[i] <- sum(boot_test_data$in_interval) / length(boot_test_data$in_interval)
}
Any help is greatly appreciated since I am fairly new to R.
Looks like that you want to initialize a three-dimensional array bootresults rather than a two-dimensional matrix. In your waldCI() you may use colMeans.
waldCI <- function(y, alpha=0.05) {
p <- colMeans(y)
se <- sqrt(p*(1 - p)/nrow(y))
z <- qnorm(1 - alpha/2)
ci <- p + z*se %*% cbind(lower=-1, upper=1)
return(ci)
}
B <- 200
numTrials <- 10
## initialize array
bootresults1 <- array(dim=c(ncol(samples10), 4, B),
dimnames=list(c(), c("p.values", "lower", "upper", "in.int"), c()))
set.seed(42)
for(i in seq_len(B)) {
samp <- samples10[sample(nrow(samples10), numTrials, replace=F), ]
ci <- waldCI(samp)
bootresults1[,,i] <- cbind(p.values, ci, in.int=ci[, 1] < p.values & p.values < ci[, 2])
}
coverage <- rowMeans(bootresults[,4,])
plot(p.values, coverage, type="l", main="My Plot")
Similar approach, more R-ish, though:
p.values <- seq(.01, .99, .01)
set.seed(42)
samples10 <- `colnames<-`(sapply(p.values, function(pr) rbinom(1.5e3, 1, pr)), p.values)
BOOT <- function(numTrials, ...) {
samp <- samples10[sample(nrow(samples10), numTrials, replace=F), ]
ci <- waldCI(samp, ...)
cbind(p.values, ci, in.int=ci[, 1] < p.values & p.values < ci[, 2])
}
B <- 200
numTrials <- 10
set.seed(42)
bootresults2 <- replicate(B, BOOT(numTrials=10))
stopifnot(all.equal(bootresults1, bootresults2))
Data:
Note, that I used rbinom(..., size=1, ...) to create your sample data. The use of "p" as an object name suggested that the data should be binomial.
set.seed(42)
samples10 <- matrix(nrow=1500, ncol=99, dimnames=list(c(), c(seq(.01, .99, .01))))
i <- 1
while (i < 100) {
samples10[, i] <- rbinom(1500, size=1, prob=i*.01) ## rows=1500 ;columns=10
i <- i + 1
}
Without a while loop, you could proceed vectorized:
p.values <- seq(.01, .99, .01)
set.seed(42)
samples10 <- `colnames<-`(sapply(p.values, function(pr) rbinom(1.5e3, 1, pr)), p.values)

Increasing weighted mean by random sample

I have written code in order to randomly add points to a numeric variable to increase the weighted mean score by 10% storing the new scores in variable S1.
This is done by calculating the total number of points that need to be added to increase the mean by 10%. Next step is to randomly select until the weighted sum of responses is equal to the target - but not adding points where the score is already 10 so as not to pass the maximum value on the scale. The final stage is to select whether the sum that is just above or just below the target is closest and select this sample to add points to.
The code works ok but doesn't look efficient. I am an R novice and have read that loops should be avoided as much as possible, but cannot work out an alternative. Is it possible to do what I am attempting, but more efficiently?
#Create random data
library(stats)
set.seed(21821)
ncust <- 1000
cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
wtvar <- rnorm(ncust, mean=1, sd=0.2)
V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
V1[V1 > 10] <- 10
V1[V1 < 1] <- 1
cust.df$V1 <- V1
cust.df$wtvar <- wtvar
#Function to determine sample required
random.sample <- function(x) {
(pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)/sum(cust.df$V1[!cust.df$V1==10]*cust.df$wtvar[!cust.df$V1==10])) #percentage of resps (weighted) who need points added to make 10% increase
(numadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)) #sum of weights needed to make 10% increase
wgttot <- vector(mode="numeric", length=0)
idtot <- vector(mode="numeric", length=0)
id.ref <- cust.df$cust.id[!cust.df$V1==10]
repeat {
preidtot <- idtot
prewgttot <- wgttot
(t.id <- as.numeric(sample(id.ref, 1)))
(t.wgt <- cust.df$wtvar[cust.df$cust.id==t.id])
id.ref <- id.ref[!id.ref==t.id]
wgttot <- c(wgttot,t.wgt)
idtot <- c(idtot, t.id)
if (sum(wgttot) > numadd) break
}
prediff <- numadd - sum(prewgttot)
postdiff <- sum(wgttot) - numadd
if (prediff < postdiff) {
x <- preidtot
} else {
x <- idtot
}
return(x)
}
tempids <- random.sample()
#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)
#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)
The random.sample is your first version, random.sample1 is the without-loop version, random.sample1 do similar thing as random.sample, but their results are different. You can check the code to see how the result of random.sample1 is used. And due to fact that from your definition, the samples required are not unique, so the results of weighted sum are also different, but all approximately increase by 10%.
#Create random data
library(stats)
set.seed(21821)
ncust <- 1000
cust.df <- data.frame(cust.id=as.factor(c(1:ncust)))
wtvar <- rnorm(ncust, mean=1, sd=0.2)
V1 <- floor(rnorm(ncust, mean=7.5, sd=3))
V1[V1 > 10] <- 10
V1[V1 < 1] <- 1
cust.df$V1 <- V1
cust.df$wtvar <- wtvar
#Function to determine sample required
random.sample <- function() {
(pctadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)/sum(cust.df$V1[!cust.df$V1==10]*cust.df$wtvar[!cust.df$V1==10])) #percentage of resps (weighted) who need points added to make 10% increase
(numadd <- (sum(cust.df$V1*cust.df$wtvar)*0.1)) #sum of weights needed to make 10% increase
wgttot <- vector(mode="numeric", length=0)
idtot <- vector(mode="numeric", length=0)
id.ref <- cust.df$cust.id[!cust.df$V1==10]
repeat {
preidtot <- idtot
prewgttot <- wgttot
(t.id <- as.numeric(sample(id.ref, 1)))
(t.wgt <- cust.df$wtvar[cust.df$cust.id==t.id])
id.ref <- id.ref[!id.ref==t.id]
wgttot <- c(wgttot,t.wgt)
idtot <- c(idtot, t.id)
if (sum(wgttot) > numadd) break
}
prediff <- numadd - sum(prewgttot)
postdiff <- sum(wgttot) - numadd
if (prediff < postdiff) {
x <- preidtot
} else {
x <- idtot
}
return(x)
}
random.sample1 <- function() {
numadd <- sum(cust.df$V1 * cust.df$wtvar) * 0.1 #sum of weights needed to make 10% increase
id.ref <- which(cust.df$V1 != 10)
pos <- sample(id.ref, length(id.ref))
t.wgt <- cust.df$wtvar[pos]
sumwgttot <- cumsum(t.wgt)
return(pos[1:which.min(abs(sumwgttot - numadd))])
}
system.time(tempids <- random.sample())
## On my computer, it uses about 0.200s to finish the calculation.
system.time(tempids1 <- random.sample1())
## On my computer, the without loop version uses about 0.000s.
#Apply sample rule
cust.df$S1 = ifelse(cust.df$cust.id %in% tempids, cust.df$V1 + 1, cust.df$V1)
## Note that the usage of tempids1 is different, this usage is more
## effective than the original one.
cust.df$S2 = cust.df$V1
cust.df$S2[tempids1] = cust.df$V1[tempids1] + 1
#Check ~10% increase achieved
weighted.mean(cust.df$V1,cust.df$wtvar)
weighted.mean(cust.df$S1,cust.df$wtvar)
weighted.mean(cust.df$S2,cust.df$wtvar)

User defined function input to loop every row of data frame

Im trying to create my very own first project in R but have hit a roadblock.
I have a data frame such as below where every row represents dataset of a financial option.
type <- c("C", "C")
marketV <- c(1.1166, 1.911)
S <- c(20, 60)
K <- c(20, 56)
T <- c(0.333, 0.5)
df <- data.frame(type, marketV, S, K, T)
I made a user defined function to take this data frame as an input and works great when the data frame is one row long. However, I'm not sure how to have my function iterate through all the data frame rows and produce a result for all of them.
I'm new to R so I'm unsure whether I should be running a 'for' loop around or playing around with lapply, or if theres a simple syntax answer. I simply want the function to take the df as input, but repeat its calculation for n row, and produce n results. Thank you for the help in advance.
My current function code for a df with 1 row below as reference:
This is a corrected version of your program:
df <- data.frame(type=c("C", "C"), marketV=c(1.1166, 1.911), S=c(20, 60), K=c(20, 56), T=c(0.333, 0.5))
IV <- function(df) {
# check if df has more then 1 row:
if (nrow(df)>1) { message("!! nrow(df)>1 !!"); return(NA) }
# Initializing of variables
r <- 0
sigma <- 0.3
sigma_down <- 0.001
sigma_up <- 1
count <- 0
type <- df$type; marketV <- df$marketV; S <- df$S; K <- df$K; T <- df$T
d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
d2 <- (log(S/K) - (sigma^2/2)*T)/(sigma*sqrt(T))
if(type=="C") {
V <- exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2))
} else {
V <- exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1)) }
difference <- V - marketV
# Root finding of sigma by Bisection method
while(abs(difference)>0.001 && count<1000) {
if(difference < 0) {
sigma_down <- sigma
sigma <- (sigma_up + sigma)/2
} else {
sigma_up <- sigma
sigma <- (sigma_down + sigma)/2
}
d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
d2 <- d1 - sigma*sqrt(T)
if(type=="C") {
V <- exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2))
} else {
V <- exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1)) }
difference <- V - marketV
count <- count + 1
}
if(count == 1000){
return(NA) # If sigma to satisfy Black76 price cannot be found
} else{
return(sigma)
}
}
sapply(split(df, seq(nrow(df))), IV)
The main thing is to run row by row through the dataframe. This is done by
sapply(split(df, seq(nrow(df))), IV)
In your original function are many errors: the biggest is accessing to S, K and so on. You might thinking taking the values from the dataframe df. But in fact you were taking the values from the workspace! I corrected this by redefining:
type <- df$type; marketV <- df$marketV; S <- df$S; K <- df$K; T <- df$T
I inserted a test for the number of rows in df, so you will get:
> IV(df)
!! nrow(df)>1 !!
[1] NA
Here is a cleaned up version of your program:
df <- data.frame(type=c("C", "C"), marketV=c(1.1166, 1.911), S=c(20, 60), K=c(20, 56), T=c(0.333, 0.5))
IV2 <- function(type, marketV, S, K, T) {
r <- 0; sigma <- 0.3
sigma_down <- 0.001; sigma_up <- 1
count <- 0
if(type=="C") {
f.sig <- function(sigma) {
d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
d2 <- d1 - sigma*sqrt(T)
exp(-r*T)*(S*pnorm(d1) - K*pnorm(d2)) - marketV
}
} else {
f.sig <- function(sigma) {
d1 <- (log(S/K) + (sigma^2/2)*T)/(sigma*sqrt(T))
d2 <- d1 - sigma*sqrt(T)
exp(-r*T)*(K*pnorm(-d2) - S*pnorm(-d1)) - marketV
}
}
ifelse(f.sig(sigma_down)*f.sig(sigma_up) < 0, uniroot(f.sig, c(sigma_down,sigma_up))$root, NA) # sigma
}
sapply(split(df, seq(nrow(df))), do.call, what="IV2")

replace NA by truncated normal distribution values in r

I am trying to replace NAs by truncated normal distribution values.
First I used sample as follows and the function worked:
v.new <- replace(vector,v, sample(8,length(v),replace =FALSE))
However when I try to use rtnorm it seems not to work. I got any error messages and it takes ages to replace the NAs by the desired interval. Any suggestion to make this work?
library(msm)
# Some data
data("airquality")
airquality$Ozone
# My function
add.trunc.to.NAvector <- function(vector){
v <- NULL
for(i in 1:length(vector)){
if(is.na(vector[i])==TRUE)
v <- append(v, i)
}
mean.val <- mean(vector)
sd.val <- sd(vector)
min.val <- mean.val - 4 * sd.val
max.val <- mean.val + 4 * sd.val
v.new <- replace(vector,v, rtnorm(length(v), lower = min.val, upper = max.val))
return(v.new)
}
Should not this work?
v <- airquality$Ozone
v.new <- v
indices <- which(is.na(v))
m <- mean(v[-indices])
s <- sd(v[-indices])
v.new[indices] <- rtnorm(length(indices), lower = m-4*s, upper = m+4*s)

double for loop in Binomial Tree in R

I want set up a model for interest rate in binomial tree. The interest rate is path dependent. I want return interest rate (discount factor and payoff) at every step in all scenarios(2^N). The reason I want to return every single interest rate is that I want use the interest rate is compute discount factor. I know how to do this in a complex way. Here I want to use a double loop (or something simpler) to get the results.
w is for "0" or "1" dummy variable matrix representing all scenarios.
r is interest rate. if there is a head(1), then r1=r0+u=r0+0.005; if there is a tail(0), then r1=r0-d.
D is discount factor. D1=1/(1+r0), D2=D1/(1+r1)...
P is payoff.
In this case, period N is 10. therefore, I can compute step by step. However,if N gets larger and larger, I cannot use my method. I want a simple way to compute this. Thank you.
#Real Price
N <- 10
r0 <- 0.06
K <- 0.05
u <- 0.005
d <- 0.004
q <- 0.5
w <- expand.grid(rep(list(0:1),N))
r <- D <- P <- matrix(0,0,nrow=2^N,ncol=N)
for(i in 1:dim(w)[1])
{
r[i,1] <- r0 + u*w[i,1] - d*(1-w[i,1])
r[i,2] <- r[i,1] + u*w[i,2] - d*(1-w[i,2])
r[i,3] <- r[i,2]+u*w[i,3]-d*(1-w[i,3])
r[i,4] <- r[i,3]+u*w[i,4]-d*(1-w[i,4])
r[i,5] <- r[i,4]+u*w[i,5]-d*(1-w[i,5])
r[i,6] <- r[i,5]+u*w[i,6]-d*(1-w[i,6])
r[i,7] <- r[i,6]+u*w[i,7]-d*(1-w[i,7])
r[i,8] <- r[i,7]+u*w[i,8]-d*(1-w[i,8])
r[i,9] <- r[i,8]+u*w[i,9]-d*(1-w[i,9])
r[i,10] <- r[i,9]*+u*w[i,10]-d*(1-w[i,10])
D[i,1] <- 1/(1+r0)
D[i,2] <- D[i,1]/(1+r[i,1])
D[i,3] <- D[i,2]/(1+r[i,2])
D[i,4] <- D[i,3]/(1+r[i,3])
D[i,5] <- D[i,4]/(1+r[i,4])
D[i,6] <- D[i,5]/(1+r[i,5])
D[i,7] <- D[i,6]/(1+r[i,6])
D[i,8] <- D[i,7]/(1+r[i,7])
D[i,9] <- D[i,8]/(1+r[i,8])
D[i,10] <- D[i,9]/(1+r[i,9])
P[i,1] <- D[i,1]*pmax(K-r0,0)*(0.5^N)
P[i,2] <- D[i,2]*pmax(K-r[i,1],0)*(0.5^N)
P[i,3] <- D[i,3]*pmax(K-r[i,2],0)*(0.5^N)
P[i,4] <- D[i,4]*pmax(K-r[i,3],0)*(0.5^N)
P[i,5] <- D[i,5]*pmax(K-r[i,4],0)*(0.5^N)
P[i,6] <- D[i,6]*pmax(K-r[i,5],0)*(0.5^N)
P[i,7] <- D[i,7]*pmax(K-r[i,6],0)*(0.5^N)
P[i,8] <- D[i,8]*pmax(K-r[i,7],0)*(0.5^N)
P[i,9] <- D[i,9]*pmax(K-r[i,8],0)*(0.5^N)
P[i,10] <- D[i,10]*pmax(K-r[i,9],0)*(0.5^N)
}
true.price <- sum(P)
#> true.price
# > true.price
# [1] 0.00292045
You can just use a nested loop, looping over 2:(ncol(w)) within the i loop:
for(i in 1:nrow(w)) {
r[i, 1] <- r0 + u*w[i, 1] - d*(1-w[i, 1])
D[i, 1] <- 1/(1+r0)
P[i, 1] <- D[i, 1]*pmax(K-r0, 0)*(0.5^N)
for (j in 2:(ncol(w))) {
r[i,j] <- r[i, j-1] + u*w[i, j] - d*(1-w[i, j])
D[i,j] <- D[i, j-1]/(1+r[i, j-1])
P[i,j] <- D[i, j]*pmax(K-r[i, j-1], 0)*(0.5^N)
}
}
true.price <- sum(P)

Resources