I am new to R and am trying to build a cumulative binomial distribution table, and I got stuck in the loop.
r = readline("please enter an integer n:")
p = seq(from = 0.1, to = 1, by = 0.1)
r = seq(from = 0, to = 100)
n <- ""
for (each in r) {
  x = qbinom(x, r, p)
}
print(x)
As an alternative to the loop: you can use expand.grid to create all combinations of k and p, and avoid the loop entirely, since pbinom can take vectors.
# Create input values
p = 1:9/10
k = 0:25
n = 25
# Create permutations of `k` and `p`: use this to make grid of values
ex <- expand.grid(p=p, k=k)
# Find probabilities for each value set
ex$P <- with(ex, pbinom(k, n, p ))
# Reshape to your required table format
round(reshape2::dcast(k ~ p, data=ex, value.var = "P"), 3)
Loop approach
# Values to match new example
p = 1:19/20
k = 0:25
n = 4
# Create matrix to match the dimensions of our required output
# We will fill this as we iterate through the loop
mat1 <- mat2 <- matrix(0, ncol=length(p), nrow=length(k), dimnames=list(k, p))
# Loop through the values of k
# We will also use the fact that you can pass vectors to `pbinom`
# so for each value of `k`, we pass the vector of `p`
# So we will update each row of our output matrix with
# each iteration of the loop
for (i in seq_along(k)) {
  mat1[i, ] <- pbinom(k[i], n, p)
}
Just for completeness, we could have updated the columns of our output matrix instead; that is, for each value of p, pass the vector k:
for (j in seq_along(p)) {
  mat2[, j] <- pbinom(k, n, p[j])
}
# Check that they give the same result
all.equal(mat1, mat2)
I would like to perform a Sobol sensitivity analysis in R
The package "sensitivity" should allow me to do so, but I don't understand how to generate the sampling matrixes (X1, X2). I have a model that runs outside of R. I have 6 parameters with uniform distribution.
In my textbook: N = (2k+2)*M; M = 2^b; b = [8,12] (new sampling method: Wu et al. 2012). With k = 6 parameters and b = 8, that means M = 256 and N = 14*256 = 3584 model runs.
I had the feeling that I should create two sampling matrices, X1_{M,k} and X2_{M,k}, and feed them both to the sobol function.
The dimension of the final sampling matrix x$X is then (k+2)*M, because:
X <- rbind(X1, X2)
for (i in 1:k) {
  Xb <- X1
  Xb[, i] <- X2[, i]
  X <- rbind(X, Xb)
}
How should I conduct my sampling to get the right number of runs, (2*k+2)*M?
This script is for the old method, but does someone know whether the new method is implemented in the sensitivity package yet? Feel free to comment on this procedure.
name = c("a" , "b" , "c" , "d" , "e", "f")
vals <- list(list(var="a", dist="unif", params=list(min=0.1, max=1.5)),
             list(var="b", dist="unif", params=list(min=-0.3, max=0.4)),
             list(var="c", dist="unif", params=list(min=-0.3, max=0.3)),
             list(var="d", dist="unif", params=list(min=0, max=0.5)),
             list(var="e", dist="unif", params=list(min=2.4E-5, max=2.4E-3)),
             list(var="f", dist="unif", params=list(min=3E-5, max=3E-3)))
k = 6
b = 8
M = 2^b
n <- 2*M
X1 <- makeMCSample(n,vals, p = 1)
X2 <- makeMCSample(n,vals, p = 2)
x <- sobol2007(model = NULL, X1, X2, nboot = 200)
If I understand correctly, I should provide a y for each row of the x$X sampling design.
Then I can use the function "tell", which will generate the Sobol' first-order indices as well as the total indices:
tell(x,y)
ggplot(x)
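Since the model runs outside of R, the y values have to be computed externally, one per row of x$X, and fed back in. A minimal sketch of that round trip, where run_model is a hypothetical stand-in for the external model (not part of the original script):

# Hypothetical wrapper: takes one parameter vector (one design row)
# and returns one model output; replace with the real external call
run_model <- function(params) {
  sum(params) # placeholder only
}
# One model evaluation per row of the design
# (drop the run-index column first if the design was built by makeMCSample)
y <- apply(x$X, 1, run_model)
tell(x, y) # computes first-order and total indices from the responses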
Supplemental R function SobolR
makeMCSample <- function(n, vals, p = 1) {
  # Packages to generate quasi-random sequences
  # and rearrange the data
  require(randtoolbox)
  require(plyr)
  # Generate a Sobol' sequence (a different seed for each of the two matrices)
  if (p == 2) {
    sob <- sobol(n, length(vals), seed = 4321, scrambling = 1)
  } else {
    sob <- sobol(n, length(vals), seed = 1234, scrambling = 1)
  }
  # Fill a matrix with the values
  # inverted from uniform values to
  # distributions of choice
  samp <- matrix(rep(0, n*(length(vals)+1)), nrow=n)
  samp[,1] <- 1:n
  for (i in 1:length(vals)) {
    l <- vals[[i]]
    dist <- l$dist
    params <- l$params
    fname <- paste("q", dist, sep="")
    samp[,i+1] <- do.call(fname, c(list(p=sob[,i]), params))
  }
  # Convert matrix to data frame and add labels
  samp <- as.data.frame(samp)
  names(samp) <- c("n", laply(vals, function(l) l$var))
  return(samp)
}
ref: Qiong-Li Wu, Paul-Henry Cournède, Amélie Mathieu, 2012, Efficient computational method for global sensitivity analysis and its application to tree growth modelling
I am trying to write a function to calculate a gradient in R. The function must specifically do so using a for loop. I am writing this function in order to illustrate how a for loop is less efficient than vectorized programming when trying to calculate the gradient.
The function takes in a design matrix X and a vector of coefficients beta in order to calculate the gradient of a cost function (the design matrix is a matrix of covariates with ones in the first column).
The cost function is the MSE, $J(\beta) = \sum_{i=1}^{n} (y_i - x_i^\top \beta)^2$ (dropping the constant 1/n factor, which does not affect the minimizer). I am calculating the gradient using an analytical solution, taking the partial derivative of the loss function. This gives us the partial derivative $\frac{\partial J}{\partial \beta_0} = -2 \sum_{i=1}^{n} (y_i - x_i^\top \beta)$ for the first coefficient $\beta_{0}$, and then similarly $\frac{\partial J}{\partial \beta_j} = -2 \sum_{i=1}^{n} x_{ij} (y_i - x_i^\top \beta)$ for all other coefficients $\beta_{j}$.
I managed to calculate the answer (implementing the above) using vectorized programming, as in the line of code below.
-2*t(X)%*%(y-X%*%beta)
My attempt at using a for loop does not work, but it looked like this:
# Initialize the matrix to hold the gradient
gradient = matrix(0, nrow = nrow(beta))
for (i in 1:nrow(beta)) {
  if (i == 1) {
    gradient[i] = -2*sum(y - X%*%beta) # first value
  } else if (i > 1) {
    gradient[i] = -2*sum( t(X)%*%(y - X%*%beta) ) * apply(X[,-1],2,sum)[i-1]
  }
}
Below is the code to generate data and the two implementations that I tried in R. It can be copied into R to generate the data and test a fix for the for loop.
# Values
# Random data Generated
n = 4
p = 3
beta_0 = 2
beta = matrix(c(beta_0,3,1,4), nrow= (p+1) ) # coefficients
X = runif(n=(n*p), min=-5,max= 5) # covariates
X = matrix(X, nrow = n, ncol = p)
X = cbind(1, X) # make a design matrix
y = apply(X[,-1],1,mean) # Response (Some function) #
# Print all to show all initial values
print(list("Initial Values:"="","n:"=n," p:" = p, "beta_0:" = beta_0," beta:"=beta,
" X:"=X," y:" = y))
# Function 1
# Find the gradient (using a for loop)
# The partial derivative of the loss function
df_ols = function(beta, X, y){
  begin.time <- proc.time()
  # Initialize the matrix to hold the gradient
  gradient = matrix(0, nrow = nrow(beta))
  for (i in 1:nrow(beta)) {
    if (i == 1) {
      gradient[i] = -2*sum(y - X%*%beta) # first value
    } else if (i > 1) {
      gradient[i] = -2*sum( t(X)%*%(y - X%*%beta) ) * apply(X[,-1],2,sum)[i-1]
    }
  }
  end.time <- proc.time()
  time <- (end.time-begin.time)[3]
  print(list("gradient 1"=gradient, "time"=time))
}
df_ols(beta,X,y)
# Function 2
# Find the gradient Approach 2 using vectorized programming
gradient_3 <- function(X, beta){
  begin.time <- proc.time()
  # Finding the gradient
  grad_3 <- -2*t(X)%*%(y-X%*%beta)
  grad_3 <- matrix(grad_3, ncol = 1, nrow = ncol(X)) # Turn into a column matrix
  end.time <- proc.time()
  time <- (end.time-begin.time)[3]
  print(list("gradient 3"= grad_3, "time"=time))
}
gradient_3(X, beta) # Assumed Correct
I apologize if I was too wordy. Any help would be appreciated.
I managed to get the for loop to work; below you'll see the answer.
# Find the gradient (using a for loop) explicit, element-wise formulations
# The partial derivative of the loss function
df_ols = function(beta, X, y){
  begin.time <- proc.time()
  # Initialize the matrix to hold the gradient
  gradient = matrix(0, nrow = nrow(beta))
  for (i in 1:nrow(beta)) {
    if (i == 1) {
      gradient[i] = -2*sum(y - X%*%beta) # first value
    } else if (i > 1) {
      gradient[i] = -2*t(X[,i])%*%(y-X%*%beta)
      # gradient[i] = -2*sum( t(X)%*%(y - X%*%beta) ) * apply(X[,-1],2,sum)[i-1]
    }
  }
  end.time <- proc.time()
  time <- (end.time-begin.time)[3]
  print(list("gradient 1"=gradient, "time"=time))
}
df_ols(beta,X,y)
Running both the vectorized and element-wise formulations, we find that the vectorized version is much faster. We implement the full process below.
# Random data Generated
n = 100000*12
p = 3
beta_0 = 2
beta = matrix(c(beta_0,3,1,4), nrow= (p+1) ) # coefficients
X = runif(n=(n*p), min=-5,max= 5) # covariates
X = matrix(X, nrow = n, ncol = p)
X = cbind(1, X) # make a design matrix
y = apply(X[,-1],1,mean) # Response (Some function)
# Print parameters. To show all initial values
print(list("Initial Values:" = '',"n:"=n," p:" = p, "beta_0:" = beta_0," beta:"=beta,
" X:"=round(X,digits=2)," y:" = round(y,digits=2)))
# Function 3
# Find the gradient Approach 3, using vectorized programming
gradient_3 <- function(X, beta, y){
  begin.time <- proc.time()
  # Find the partial derivative of the rest (j'th)
  grad_3 <- -2*t(X)%*%(y-X%*%beta)
  grad_3 <- matrix(grad_3, ncol = 1, nrow = ncol(X)) # Turn into a column matrix
  end.time <- proc.time()
  time <- (end.time-begin.time)[3]
  print(list("gradient 3"= grad_3, "time"=time))
}
Showing the results
> df_ols(beta,X,y) # Elementwise
$`gradient 1`
[,1]
[1,] 4804829
[2,] 53311879
[3,] 13471077
[4,] 73259191
$time
elapsed
3.59
> gradient_3(X, beta,y) # Vectorized Programming solution
$`gradient 3`
[,1]
[1,] 4804829
[2,] 53311879
[3,] 13471077
[4,] 73259191
$time
elapsed
0.89
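As an aside (not part of the original comparison), the vectorized form can usually be tightened a little further with crossprod, which computes t(X) %*% v without explicitly forming the transpose; a small sketch:

# Equivalent to -2 * t(X) %*% (y - X %*% beta), but avoids
# materializing t(X); typically a bit faster on large design matrices
grad_cp <- -2 * crossprod(X, y - X %*% beta)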
library(GoFKernel)
library(ggplot2)
rejection_fx_sqz <- function(n){
  # inv_G, g_x, lower, fstar, M and I are defined elsewhere in the script
  x <- vector() # the output vector in which simulated values should be stored
  acpt <- 0     # count the accepted values
  tol <- 0      # count the total number of values (accepted or not accepted)
  len_x = 0
  while(len_x < n){
    # determine the number to generate - not less than 20;
    # round up so runif gets a whole number and the count in tol is exact
    n_to_gen = ceiling(max((n-len_x)/0.69, 20))
    tol = tol + n_to_gen # count the total number of values simulated
    u1 = runif(n_to_gen) # simulate u1
    u2 = runif(n_to_gen) # simulate u2
    y = inv_G(u2)
    g <- g_x(y)
    d <- g*y*(5-y)
    condU <- (M*u1) >= 1/d
    condL <- !condU
    condL[condL] <- (M*u1[condL]) <= lower(y[condL])/d[condL]
    other <- !(condU | condL) # condition of below the W_U and above W_L
    # modify condL because some samples can still be accepted given condL is false
    condL[other] <- u1[other] <= fstar(y[other])/(M*g[other])
    cond <- condL
    acpt = acpt + sum(cond) # count the number of accepted values
    x <- c(x, y[cond])      # add accepted values to the output vector
    len_x <- length(x)
  }
  p = acpt / tol
  return(list(x=x[1:n], p=p))
}
n=100000
x=rejection_fx_sqz(n) # a function that simulates from f(x) by generating n samples
x_fx <- data.frame(x=x$x)
x=x_fx$x
x_plot = cbind(x_fx, fy = 1/(I*x*(5-x))*exp(-1/8*(-1+log(x/(5-x)))^2))
f_cdf <- function(x) {
  integrate(fstar, 0, x)$value/I
}
# quantile function, inverse cdf
f_q <- inverse(f_cdf, lower=0.000000000000000001, upper=4.999999999999999999)
ggplot(x_plot, aes(sample=x))+
labs(title="Empirical against theoretical quantiles")+
stat_qq(distribution=f_q) +
stat_qq_line(distribution=f_q)
What I'm trying to do is produce a quantile-quantile diagnostic plot for my algorithm for simulating from f(x). The problem is that I keep getting two error messages:
Computation failed in stat_qq(): unused argument (p = quantiles)
Computation failed in stat_qq_line(): unused argument (p = quantiles)
I am a beginner with the R language and this is driving me crazy. Any help is appreciated.
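For context: stat_qq and stat_qq_line call the function passed as distribution with a named argument p, so the error message suggests that the quantile function returned by GoFKernel::inverse does not accept an argument with that name. A sketch of a wrapper that should sidestep the error (assuming f_q works for one value at a time):

# Wrap the inverse CDF so its first argument is literally named `p`,
# and vectorize it with sapply, since stat_qq passes a whole vector
# of probabilities at once
f_q_vec <- function(p) sapply(p, f_q)
ggplot(x_plot, aes(sample = x)) +
  labs(title = "Empirical against theoretical quantiles") +
  stat_qq(distribution = f_q_vec) +
  stat_qq_line(distribution = f_q_vec)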
I know I can use expand.grid for this, but I am trying to learn actual programming. My goal is to take what I have below and use a recursion to get all 2^n binary sequences of length n.
I can do this for n = 1, but I don't understand how I would use the same function in a recursive way to get the answer for higher dimensions.
Here is for n = 1:
binseq <- function(n){
  binmat <- matrix(nrow = 2^n, ncol = n)
  r <- 0 # row counter
  for (i in 0:1) {
    r <- r + 1
    binmat[r,] <- i
  }
  return(binmat)
}
I know I probably have to use cbind in the return statement. My intuition says the return statement should be something like cbind(binseq(n-1), binseq(n)). But, honestly, I'm completely lost at this point.
The desired output should basically recursively produce this for n = 3:
binmat <- matrix(nrow = 8, ncol = 3)
r <- 0 # current row of binmat
for (i in 0:1) {
  for (j in 0:1) {
    for (k in 0:1) {
      r <- r + 1
      binmat[r,] <- c(i, j, k)
    }
  }
}
binmat
It should just be a matrix as binmat is being filled recursively.
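For comparison with the answers below, here is a minimal sketch of the cbind-style recursion hinted at above: prepend 0, and then 1, to every row of the length-(n-1) solution.

binseq <- function(n){
  # Base case: the two sequences of length 1, one per row
  if (n == 1) return(matrix(0:1, nrow = 2, ncol = 1))
  # Recursive case: all length-(n-1) sequences, prefixed by 0, then by 1
  prev <- binseq(n - 1)
  rbind(cbind(0, prev), cbind(1, prev))
}
binseq(3) # same 8 x 3 matrix as the nested-loop example above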
I quickly wrote this function to generate all N^K permutations of length K for given N characters. Hope it will be useful.
gen_perm <- function(str=c(""), lst=5, levels = c("0", "1", "2")){
  if (nchar(str) == lst){
    cat(str, "\n")
    return(invisible(NULL))
  }
  for (i in levels){
    gen_perm(str = paste0(str, i), lst=lst, levels=levels)
  }
}
# sample call
gen_perm(lst = 3, levels = c("x", "T", "a"))
I will return to your problem when I get more time.
UPDATE
I modified the code above to work for your problem. Note that the matrix being populated lives in the global environment, and the function uses the tmp variable to pass rows to the global environment. This was the easiest way for me to solve the problem; perhaps there are other ways.
levels <- c(0,1)
nc <- 3
m <- matrix(numeric(0), ncol = nc)
gen_perm <- function(row=numeric(), lst=nc, levels = levels){
  if (length(row) == lst){
    assign("tmp", row, .GlobalEnv)
    with(.GlobalEnv, {m <- rbind(m, tmp); rownames(m) <- NULL})
    return(invisible(NULL))
  }
  for (i in levels){
    gen_perm(row=c(row, i), lst=lst, levels=levels)
  }
}
gen_perm(lst=nc, levels=levels)
UPDATE 2
To get the expected output you provided, run
m <- matrix(numeric(0), ncol = 3)
gen_perm(lst = 3, levels = c(0,1))
m
levels specifies the set of values used to generate the permutations (binary in our case), m is an empty matrix to fill up, gen_perm generates rows and adds them to the matrix m, and lst is the length of the permutation (it matches the number of columns in the matrix).
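Since there may indeed be other ways, here is a side-effect-free variant (a sketch, not part of the original answer) in which the recursion returns its rows and rbind assembles them, so nothing has to live in the global environment:

gen_perm_pure <- function(lst, levels, row = numeric()) {
  # Base case: a complete row becomes a one-row matrix
  if (length(row) == lst) return(matrix(row, nrow = 1))
  # Recursive case: extend the row by each level and stack the results
  do.call(rbind, lapply(levels, function(i) gen_perm_pure(lst, levels, c(row, i))))
}
m <- gen_perm_pure(lst = 3, levels = c(0, 1)) # same matrix, no globals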
I want to perform a bootstrap simulation 1000 times and compute percentile confidence intervals 1000 times for different sample sizes n = 10, 20, ..., 100. I've solved this problem, and I'm just asking: instead of repeating this huge computation 10 times, covering 300 lines of code, is there a way to shorten it, like running this function over and over again 10 times? I tried a for loop, but it did not work. Here is the code that does work:
B = 1000 # number of replicates
kHat = Parameters[1] # approx = 2.06786
gammaHat = Parameters[2] # approx = 0.51144
TheoreticalMean = kHat/gammaHat
TheoreticalVariance = kHat/gammaHat^2
PercCoverage = vector("numeric", 10L)
n = 10 # sample size
getCI = function(B, k, gamma, n) {
  getM = function(orgData, idx) {
    bsM = mean(orgData[idx])
    bsS2M = (((n-1) / n) * var(orgData[idx])) / n
    c(bsM, bsS2M)
  }
  F = rgamma(n, kHat, gammaHat) # simulated data: original sample
  M = mean(F)                   # M from original sample
  S2M = (((n-1)/n)*var(F))/n    # S^2(M) from original sample
  # bootstrap
  boots = t(replicate(B, getM(F, sample(seq(along=F), replace=TRUE))))
  Mstar = boots[,1]     # M* for each replicate
  S2Mstar = boots[,2]   # S^2*(M) for each replicate
  biasM = mean(Mstar)-M # bias of estimator M
  # indices for sorted vector of estimates
  idx = trunc((B+1)*c(0.05/2, 1-0.05/2))
  ciPerc = sort(Mstar)[idx] # percentile CI
  c(perc=ciPerc)
}
# 1000 bootstraps
Nrep <- 1000 # number of bootstraps
CIs <- t(replicate(Nrep, getCI(B, kHat, gammaHat, n)))
# coverage probabilities
PercCoverage[1] = sum((CIs[,"perc1"]<TheoreticalMean) & (CIs[,"perc2"]>TheoreticalMean)) / Nrep
However, here I need to script this for n = 10, n = 20, and so on up to n = 100, and each time I need to change PercCoverage[1] to PercCoverage[2] ... PercCoverage[10] in order to store these values in an array for later plotting.
I tried setting n = c(10,20,30,40,50,60,70,80,90,100) and then placing all of the above in a for loop, but the function getCI needed a numerical value.
EDIT: For loop attempt:
n = c(10,20,30,40,50,60,70,80,90,100)
B = 1000 # number of replicates
kHat = Parameters[1] # approx = 2.06786
gammaHat = Parameters[2] # approx = 0.51144
TheoreticalMean = kHat/gammaHat
TheoreticalVariance = kHat/gammaHat^2
PercCoverage = vector("numeric", 10L)
for (i in length(n)){
  getCI = function(B, k, gamma, n[i]) {
    getM = function(orgData, idx) {
      bsM = mean(orgData[idx])
      bsS2M = (((n[i]-1) / n[i]) * var(orgData[idx])) / n[i]
      c(bsM, bsS2M)
    }
    F = rgamma(n[i], kHat, gammaHat)    # simulated data: original sample
    M = mean(F)                         # M from original sample
    S2M = (((n[i]-1)/n[i])*var(F))/n[i] # S^2(M) from original sample
    # bootstrap
    boots = t(replicate(B, getM(F, sample(seq(along=F), replace=TRUE))))
    Mstar = boots[,1]     # M* for each replicate
    S2Mstar = boots[,2]   # S^2*(M) for each replicate
    biasM = mean(Mstar)-M # bias of estimator M
    # indices for sorted vector of estimates
    idx = trunc((B+1)*c(0.05/2, 1-0.05/2))
    ciPerc = sort(Mstar)[idx] # percentile CI
    c(perc=ciPerc)
  }
  # 1000 bootstraps
  Nrep <- 1000 # number of bootstraps
  CIs <- t(replicate(Nrep, getCI(B, kHat, gammaHat, n[i])))
  # coverage probabilities
  PercCoverage[i] = sum((CIs[,"perc1"]<TheoreticalMean) & (CIs[,"perc2"]>TheoreticalMean)) / Nrep
}
Consider defining multiple functions: a master one, bootstrap_proc, plus getCI and getM. Then pass your sequence of sample sizes to lapply (for a list return) or sapply (for a numeric vector); each call invokes the master function and returns a coverage probability (the last line of the function). Be sure to remove the hard-coded n = 10.
Define Functions
B = 1000 # number of replicates
kHat = Parameters[1] # approx = 2.06786
gammaHat = Parameters[2] # approx = 0.51144
TheoreticalMean = kHat/gammaHat
TheoreticalVariance = kHat/gammaHat^2
bootstrap_proc <- function(n) {
  Nrep <- 1000 # 1000 bootstraps
  CIs <- t(replicate(Nrep, getCI(B, kHat, gammaHat, n)))
  # coverage probability
  sum((CIs[,"perc1"]<TheoreticalMean) & (CIs[,"perc2"]>TheoreticalMean)) / Nrep
}
getCI <- function(B, k, gamma, n) {
  F <- rgamma(n, kHat, gammaHat) # simulated data: original sample
  M <- mean(F)                   # M from original sample
  S2M <- (((n-1)/n)*var(F))/n    # S^2(M) from original sample
  # bootstrap
  boots <- t(replicate(B, getM(F, sample(seq(along=F), replace=TRUE), n)))
  Mstar <- boots[,1]     # M* for each replicate
  S2Mstar <- boots[,2]   # S^2*(M) for each replicate
  biasM <- mean(Mstar)-M # bias of estimator M
  # indices for sorted vector of estimates
  idx <- trunc((B+1)*c(0.05/2, 1-0.05/2))
  ciPerc <- sort(Mstar)[idx] # percentile CI
  c(perc=ciPerc)
}
getM <- function(orgData, idx, n) {
  bsM <- mean(orgData[idx])
  bsS2M <- (((n-1) / n) * var(orgData[idx])) / n
  c(bsM, bsS2M)
}
Call Function
sample_sizes <- c(10,20,30,40,50,60,70,80,90,100)
# LIST
PercCoverage <- lapply(sample_sizes, bootstrap_proc)
# VECTOR
PercCoverage <- sapply(sample_sizes, bootstrap_proc)
# VECTOR
PercCoverage <- vapply(sample_sizes, bootstrap_proc, numeric(1))
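Since the question mentions keeping these values for later plotting, a quick sketch of that step, using the numeric vector from the sapply or vapply variant (the labels and the nominal-level reference line are my own additions):

# Coverage of the percentile CI as a function of sample size;
# the dashed line marks the nominal 95% level
plot(sample_sizes, PercCoverage, type = "b",
     xlab = "sample size n", ylab = "estimated coverage", ylim = c(0, 1))
abline(h = 0.95, lty = 2)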