Make datasets by loop in R - r

I'm trying to learn how to make a loop in R
I have this:
sigma2 <- 0.4
a0 <- -0.1260805
b <- 0.1260805
tt <- 1:50, 1:50
z <- rnorm(50, 0, sigma2)
y <- rep(1, 50)
for(i in 1:50){
y[i]=exp(a0 + b*tt[i])*exp(z[i])
}
y
and I want to kind of test the code above 1000 times, since I want to test the hypothesis at the 0.05 level
can I treid this, and seens to be wrong:
aa <- rep(1, 1000)
for(i in 1:1000){
y[i]=exp(a0 + b1*tt[i])*exp(z[i])
}
Thanks for help!

I think this is what you want (or at least closer):
# re-write original code with vectorization:
n <- 50
sigma2 <- 0.4
a0 <- -0.1260805
b <- 0.1260805
tt <- 1:n
z <- rnorm(n, 0, sigma2)
y <- exp(a0 + b*tt)*exp(z)
# do it 20 times
result <- replicate(20, exp(a0 + b*tt)*exp(rnorm(n, 0, sigma2)))
result is a 50x20 matrix - one column per repetition.

Related

How to perform a bootstrap and find confidence interval in R

I want to create a custom bootstrap function because I want to better understand what bootstrap is doing and it seems like the other bootstrap libraries out there does not solve my issue.
The Problem: I would like to create my own wald confidence interval function where it takes in the bootstrap data, outputs the confidence interval, test the confidence interval is within a range, and gets the coverage.
Right now, I am getting this type of error:
Error in bootresults[i,}<-waldCI(y=bootdata[i], n=numTrials):number of
items to replace is not a multiple of replacement length
The goal: My goal is to get the bootresults dataset to return 4 columns(p value,One that shows the upper bound, lower bound, and whether or not the p is in the interval) and get a graph similar to this one:
Wald interval chart
Code:
set.seed(42)
samples10 <- list()
i <- 1
while(i < 100) {
sample10[[i]] <- rbinom(1500, size=10, prob=i*.01) ## rows=1500 ;columns=10
i <- i + 1
}
sample10 <- data.frame(samples10)
colnames(sample10) <- c(seq(.01, .99, .01)) ## p-values
waldconfidenceinterval <- function(y, n, alpha=0.05) {
p <- colSums(y)/(n*200)
sd <- sqrt(p*((1 - p)/(n*200)))
z <- qnorm(c(alpha/2, 1 - alpha/2))
ci <- p + z*sd
return(ci)
}
B <- 200
numTrials <- 10
bootresults <- matrix(ncol=length(sample10), nrow=B) ## rows=200, cols=99
## empty matrix in the beginning
set.seed(42)
for(i in seq_len(B)) {
bootdata <- sample10[sample(B, replace=T), ]
bootresults[i, ] <- waldCI(y=bootdata[i], n=numTrials)
## Pseudocode:
# boot_test_data$in_interval <-
# ifelse(boot_test_data$lower1 < i/100 & i/100 < boot_test_data$upper1, 1, 0)
# coverage[i] <- sum(boot_test_data$in_interval) / length(boot_test_data$in_interval)
}
Any help is greatly appreciated since I am fairly new to R.
Looks like that you want to initialize a three-dimensional array bootresults rather than a two-dimensional matrix. In your waldCI() you may use colMeans.
waldCI <- function(y, alpha=0.05) {
p <- colMeans(y)
se <- sqrt(p*(1 - p)/nrow(y))
z <- qnorm(1 - alpha/2)
ci <- p + z*se %*% cbind(lower=-1, upper=1)
return(ci)
}
B <- 200
numTrials <- 10
## initialize array
bootresults1 <- array(dim=c(ncol(samples10), 4, B),
dimnames=list(c(), c("p.values", "lower", "upper", "in.int"), c()))
set.seed(42)
for(i in seq_len(B)) {
samp <- samples10[sample(nrow(samples10), numTrials, replace=F), ]
ci <- waldCI(samp)
bootresults1[,,i] <- cbind(p.values, ci, in.int=ci[, 1] < p.values & p.values < ci[, 2])
}
coverage <- rowMeans(bootresults[,4,])
plot(p.values, coverage, type="l", main="My Plot")
Similar approach, more R-ish, though:
p.values <- seq(.01, .99, .01)
set.seed(42)
samples10 <- `colnames<-`(sapply(p.values, function(pr) rbinom(1.5e3, 1, pr)), p.values)
BOOT <- function(numTrials, ...) {
samp <- samples10[sample(nrow(samples10), numTrials, replace=F), ]
ci <- waldCI(samp, ...)
cbind(p.values, ci, in.int=ci[, 1] < p.values & p.values < ci[, 2])
}
B <- 200
numTrials <- 10
set.seed(42)
bootresults2 <- replicate(B, BOOT(numTrials=10))
stopifnot(all.equal(bootresults1, bootresults2))
Data:
Note, that I used rbinom(..., size=1, ...) to create your sample data. The use of "p" as an object name suggested that the data should be binomial.
set.seed(42)
samples10 <- matrix(nrow=1500, ncol=99, dimnames=list(c(), c(seq(.01, .99, .01))))
i <- 1
while (i < 100) {
samples10[, i] <- rbinom(1500, size=1, prob=i*.01) ## rows=1500 ;columns=10
i <- i + 1
}
Without a while loop, you could proceed vectorized:
p.values <- seq(.01, .99, .01)
set.seed(42)
samples10 <- `colnames<-`(sapply(p.values, function(pr) rbinom(1.5e3, 1, pr)), p.values)

Using random matrix in the loop (R)

I am trying to create an empirical histogram of eigenvalue spacing for random matrices using loop. Seems simple but not working so far... I am getting this error: "Error in M[k] <- (c(x[k], y[k], y[k], z[k])) :
number of items to replace is not a multiple of replacement length.”
I tried writing M[ ,k] but I still got the same error. If anyone can help me with that line, it would be great! Here is my code:
x <- rnorm(1000,0,1)
y <- rnorm(1000,0,1/2)
z <- rnorm(1000,0,1)
M <- matrix(0,2,2)
a <- rep(0,1000)
b <- rep(0,1000)
s <- rep(0,1000)
for(k in 1:1000){
M[k] =(c(x[k],y[k],y[k],z[k]))
temp = eigen(M[k])$value
a[k] <- max(temp)
b[k] <- min(temp)
s[k] <- a[k]-b[k]
}
If you are only interested in creating s, you can make your code considerably simpler by using sapply instead of a loop. You create the matrix M for each iteration and return the difference between the maximum and minimum eigenvalue. This will make your vector s without all the intermediate variables.
set.seed(69) # Makes the example reproducible
x <- rnorm(1000, 0, 1)
y <- rnorm(1000, 0, 1/2)
z <- rnorm(1000, 0, 1)
s <- sapply(seq(1000), function(k) {
M <- matrix(c(x[k], y[k], y[k], z[k]), 2, 2)
max(eigen(M)$value) - min(eigen(M)$value)
})
hist(s)
You can even get rid of x, y, z if you just sample as you go:
set.seed(69) # Makes the example reproducible
s <- sapply(seq(1000), function(k) {
M <- matrix(c(rnorm(1), rep(rnorm(1, 0, 1/2), 2), rnorm(1)), 2, 2)
max(eigen(M)$value) - min(eigen(M)$value)
})
hist(s)

Generating data and saving estimates in a loop in R

I'm a beginner with R and programming in general and i'm having some problems with this loop.
Basically i want to generate 10,000 estimates of beta_2 when n=10 and store them in a vector where the estimator in question is given by the formula (cov(x,y)/var(x)).
Ive tried the following code but it only yields the first estimate correctly and fills the other positions in the vector as NA. Any tips to solve this?
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
rep <- 10000
vect <- vector(mode="numeric", length=rep)
for(i in 1:rep){vect[i] <-(cov(X,Y) / var(X))[i]}
You must simulate the vectors X and Y inside the loop.
n <- 10
X_meanlog <- 0
X_sdlog <- 1
u_mean <- 0
u_sd <- 1
beta_1 <- 2
beta_2 <- 3
set.seed(5276) # Make the results reproducible
rept <- 10000
vect <- vector(mode="numeric", length=rept)
for(i in 1:rept){
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
vect[i] <- (cov(X, Y) / var(X))
}
mean(vect)
#[1] 3.002527
You can also run the following simpler simulation.
set.seed(5276) # Make the results reproducible
X <- replicate(rept, rlnorm(n, X_meanlog, X_sdlog))
u <- replicate(rept, rnorm(n, u_mean, u_sd))
Y <- beta_1 + beta_2 * X + u
vect2 <- sapply(seq_len(rept), function(i)
cov(X[, i], Y[, i]) / var(X[, i])
)
mean(vect2)
#[1] 3.001131

For loops for nested variables within function in R

I would like to iterate through vectors of values and calculate something for every value while being within a function environment in R. For example:
# I have costs for 3 companies
c <- c(10, 20, 30)
# I have the same revenue across all 3
r <- 100
# I want to obtain the profits for all 3 within one variable
result <- list()
# I could do this in a for loop
for(i in 1:3){
result[i] <- r - c[i]
}
Now lets assume I have a model that is very long and I define everything as a function which is to be solved with various random draws for the costs.
# Random draws
n <- 1000
r <- rnorm(n, mean = 100, sd = 10)
c1 <- rnorm(n, mean = 10, sd = 1)
c2 <- rnorm(n, mean = 20, sd = 2)
c3 <- rnorm(n, mean = 30, sd = 3)
X <- data.frame(r, c1, c2, c3)
fun <- function(x){
r <- x[1]
c <- c(x[2], x[3], x[4])
for(i in 1:3){
result[i] <- r - c[i]
}
return(result)
}
I could then evaluate the result for all draws by iterating through the rows of randomly sampled input data.
for(j in 1:n){
x <- X[j,]
y <- fun(x)
}
In this example, the output variable y would entail the nested result variable which comprises of the results for all 3 companies. However, my line of thinking results in an error and I think it has to do with the fact that I try to return a nested variable? Hence my question how you guys would approach something like this.
I would suggest rethinking your coding approach. This is a very un-R-like way of doing things.
For example, the first for loop can be written much more succinctly as
x <- c(10, 20, 30)
r <- 100
result <- lapply(-x, `+`, r)
Then fun becomes something like
fun <- function(x) lapply(-x[-1], `+`, x[1])
To then operate over the rows of a data.frame (which is what you seem to do in the last step), you can use something like
apply(X, 1, fun)
where the MARGIN = 1 argument in apply ensures that you are applying a function per row (as opposed to per column).
Here's an approach using your function and a for loop:
# Random draws
n <- 1000
r <- rnorm(n, mean = 100, sd = 10)
c1 <- rnorm(n, mean = 10, sd = 1)
c2 <- rnorm(n, mean = 20, sd = 2)
c3 <- rnorm(n, mean = 30, sd = 3)
X <- data.frame(r, c1, c2, c3)
result <- list()
fun <- function(x){
r <- x[[1]]
c <- c(x[[2]], x[[3]], x[[4]])
for(i in 1:3){
result[i] <- r - c[i]
}
return(result)
}
# Create a list to store results
profits <- rep(rep(list(1:3)),nrow(X))
# Loop throuhg each row of dataframe and store in profits.
for(i in 1:nrow(X)){
profits_temp <-
fun(list(X[i,"r"],X[i,"c1"],X[i,"c2"],X[i,"c3"]))
for(j in 1:3)
profits[[i]][[j]] <- profits_temp[[j]]
}
# Eye results
profits[[1]]
#> [1] 93.23594 81.25731 70.27699
profits[[2]]
#> [1] 80.50516 69.27517 63.36439

Simulated dataset in R

I'm simulating another dataset here, and am stuck again!
Here's what I want to do:
200 observations, with 90 independent variables (mean 0, sd 1)
the equation to create y is: y = 2x_1 + ... + 2x_30 - x_31 - ... - x_60 + 0*x_61 + ... + 0*x_90 + mu
(In other words, the first 30 x values will have a coefficient of 2, next 30 values have coefficient of -1 and last 30 values have coefficient of 0). mu is also a random generated normal variable with mean 0, sd 10.
Here's what I have so far:
set.seed(11)
n <- 200
mu <- rnorm(200,0,10)
p1 <- for(i in 1:200){
rnorm(200,0,1)
}
p2 <- cbind(p1)
p3 <- for(i in 1:90){
if i<=30, y=2x
if i>30 & i<=60, y=-x
if i>60 & i<=90, y=0x
}
I'm still learning many aspects of R, so I'm pretty sure the code has much wrong with it, even in terms of syntax. Your help would really be appreciated!
Thanks!
Try
library(mvtnorm)
coefs <- rep(c(2, -1, 0), each=30)
mu <- rnorm(200, 0, 10)
m <- rep(0, 90) # mean of independent variables
sig <- diag(90) # cov of indep variables
x <- rmvnorm(200, mean=m, sigma=sig) # generates 200 observations from multivariate normal
y <- x%*%coefs + mu
In case, if you are not comfortable with linear-algebra
n <- 200
coefs <- rep(c(2, -1, 0), each=30)
mu <- rnorm(n, 0, 10)
x <- matrix(nrow=n, ncol=90) # initializes the indep.vars
for(i in 1:90){
x[, i] <- rnorm(200, 0, 1)
}
y <- rep(NA, n) # initializes the dependent vars
for(i in 1:n){
y[i] = sum(x[i,]*coefs) + m[i]
}
x[i,]*coefs gives exactly (2*x_1,..., 2*x_30, -x_31,...,- x_60,0*x_61,...,0*x_90) because * is element-wise operation.
You'd better learn the rudimentaries of R, before actually doing something with it.

Resources