How to create a vector when using For Loop? - r

I am trying to conduct a two sided sign test from 10,000 random normal samples of size 30. I am trying to extract the p-values given from the binom.test and put them into a vector but can't quite figure out how to execute this.
set.seed(100)
sample <- matrix(rnorm(300000, mean=0.1, sd=1), 10000, 30)
success <- ifelse(sample>=0, 1, 0)
success
#sample[1,]
#success[1,]
#sum(success[1,])
#for loop
for(i in 1:10000){
pvalue<- binom.test(sum(success[i,]), 30, p=0.5,
alternative = c("two.sided"),
conf.level = 0.95)$p.value
p_values_success <- ifelse(pvalue<=0.05, 1, 0)
}

I guess what you are trying to do is
pvalue <- numeric(length = 1000L)
p_values_success <- numeric(length = 1000L)
for(i in 1:10000) {
pvalue[i] <- binom.test(sum(success[i,]), 30, p=0.5,
alternative = c("two.sided"),
conf.level = 0.95)$p.value
p_values_success[i] <- ifelse(pvalue[i]<=0.05, 1, 0)
}
However, if I had to rewrite you code completely from scratch I would do
set.seed(100)
sample <- matrix(rnorm(300000, mean=0.1, sd=1), 10000, 30)
success[] <- as.integer(sample >=0)
t(apply(success, 1, function(x) {
p_val <- binom.test(sum(x), 30, p=0.5,alternative = c("two.sided"),
conf.level = 0.95)$p.value
c(p_val, as.integer(p_val<=0.05))
}))
This will return a 2-column matrix where 1st column is pvalue and the second one is p_values_success.

You could also do:
apply(success, 1,
FUN = function(x)
ifelse(
binom.test(sum(x), 30, p = 0.5,
alternative = "two.sided", conf.level = 0.95)$p.value <= 0.05, 1, 0
)
)

Related

R bootstrapping for the two dataframe individual column wise

Want to do Bootstrapping while comparing two dataframe column wise with the different number of rows.
I have two dataframe in which row represent values from experiments and column with the dataset names (data1, data2, data3, data4)
emp.data1 <- data.frame(
data1 = c(234,0,34,0,46,0,0,0,2.26,0, 5,8,93,56),
data2 = c(1.40,1.21,0.83,1.379,2.60,9.06,0.88,1.16,0.64,8.28, 5,8,93,56),
data3 =c(0,34,43,0,0,56,0,0,0,45,5,8,93,56),
data4 =c(45,0,545,34,0,35,0,35,0,534, 5,8,93,56),
stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
data1 = c(45, 0, 0, 45, 45, 53),
data2 = c(23, 0, 45, 12, 90, 78),
data3 = c(72, 45, 756, 78, 763, 98),
data4 = c(1, 3, 65, 78, 9, 45),
stringsAsFactors = FALSE
)
I am trying to do bootstrapping(n=1000). Values are selected at random replacement from emp.data1(14 * 4) without change in the emp.data2(6 * 4). For example from emp.data2 first column (data1) select 6 values colSum and from emp.data1(data1) select 6 random non zero values colSum Divide the values and store in temp repeat the same 1000 times and take a median value et the end. like this i want to do it for each column of the dataframe. sample code I am providing which is working fine but i am not able get the non-zero random values for emp.data1
nboot <- 1e3
boot_temp_emp<- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(emp.data2)/colSums(emp.data1[boot,])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data<- apply(boot_temp_emp, 2, median)
From the above script i am able get the output but each column emp.data1[boot,] data has zero values and taken sum. I want indivisual ramdomly selected non-zero values column sum so I tried below script not able remove zero values. Not able get desired output please some one help me to correct my script
nboot <- 1e3
boot_temp_emp<- c()
for (i in colnames(emp.data2)){
for (j in seq_len(nboot)){
data1=emp.data1[i]
data2=emp.data2[i]
n_data1 <- nrow(data1); n_data2 <- nrow(data2)
boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
value <- colSums(data2[i])/colSums(data1[boot, ,drop = FALSE])
boot_temp_emp <- rbind(boot_temp_emp, value)
}
}
boot_data<- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments.
x the input data.frame emp.data1;
s2 the columns sums of emp.data2;
n = 6 the number of vector elements to sample from emp.data1's columns with a default value of 6.
The create a results matrix, pre-compute the column sums of emp.data2 and call the function in a loop.
boot_fun <- function(x, s2, n = 6){
# the loop makes sure ther is no divide by zero
nrx <- nrow(x)
repeat{
i <- sample(nrx, n, replace = TRUE)
s1 <- colSums(x[i, ])
if(all(s1 != 0)) break
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-24 by the reprex package (v2.0.1)
Edit
Following the comments here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors, before computing their sums.
boot_fun2 <- function(x, s2, n = 6){
nrx <- nrow(x)
ncx <- ncol(x)
s1 <- numeric(ncx)
for(j in seq.int(ncx)) {
repeat{
i <- sample(nrx, n, replace = TRUE)
if(all(x[i, j] != 0)) {
s1[j] <- sum(x[i, j])
break
}
}
}
s2/s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for(i in seq_len(nboot)){
results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for(j in 1:4) {
main <- paste0("data", j)
hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
Created on 2022-02-27 by the reprex package (v2.0.1)

MCMC for estimating negative binomial distribution

I want to estimate parameters of negative binomial distribution using MCMC Metropolis-Hastings algorithm. In other words, I have sample:
set.seed(42)
y <- rnbinom(20, size = 3, prob = 0.2)
and I want to write algorithm that will estimate parameter of size and parameter of prob.
My work so far
I defined prior distribution of size as Poisson:
prior_r <- function(r) {
return(dpois(r, lambda = 2, log = T))
}
And prior distribution of prob as uniform on [0, 1]:
prior_prob <- function(prob) {
return(dunif(prob, min = 0, max = 1, log = T))
}
Moreover for simplicity I defined loglikelihood and joint probability functions:
loglikelihood <- function(data, r, prob) {
loglikelihoodValue <- sum(dnorm(data, mean = r, sd = prob, log = T))
return(loglikelihoodValue)
}
joint <- function(r, prob) {
data <- y
return(loglikelihood(data, r, prob) + prior_r(r) + prior_prob(prob))
}
Finally, the whole algorithm:
run_mcmc <- function(startvalue, iterations) {
chain <- array(dim = c(iterations + 1, 2))
chain[1, ] <- startvalue
for (i in 1:iterations) {
proposal_r <- rpois(1, lambda = chain[i, 1])
proposal_prob <- chain[i, 2] + runif(1, min = -0.2, max = 0.2)
quotient <- joint(proposal_r, proposal_prob) - joint(chain[i, 1], chain[i, 2])
if (runif(1, 0, 1) < min(1, exp(quotient))) chain[i + 1, ] <- c(proposal_r, proposal_prob)
else chain[i + 1, ] <- chain[i, ]
}
return(chain)
}
The problem
Problem that I'm having is that when I run it with starting values even very close to correct ones:
iterations <- 2000
startvalue <- c(4, 0.25)
res <- run_mcmc(startvalue, iterations)
I'll obtain posterior distribution which is obviously wrong. For example
> colMeans(res)
[1] 11.963018 0.994533
As you can see, size is located very close to point 12, and probability is located in point 1.
Do you know what's the cause of those phenomeons?
Change dnorm in loglikelihood to dnbinom and fix the proposal for prob so it doesn't go outside (0,1):
set.seed(42)
y <- rnbinom(20, size = 3, prob = 0.2)
prior_r <- function(r) {
return(dpois(r, lambda = 2, log = T))
}
prior_prob <- function(prob) {
return(dunif(prob, min = 0, max = 1, log = TRUE))
}
loglikelihood <- function(data, r, prob) {
loglikelihoodValue <- sum(dnbinom(data, size = r, prob = prob, log = TRUE))
return(loglikelihoodValue)
}
joint <- function(r, prob) {
return(loglikelihood(y, r, prob) + prior_r(r) + prior_prob(prob))
}
run_mcmc <- function(startvalue, iterations) {
chain <- array(dim = c(iterations + 1, 2))
chain[1, ] <- startvalue
for (i in 1:iterations) {
proposal_r <- rpois(1, lambda = chain[i, 1])
proposal_prob <- chain[i, 2] + runif(1, min = max(-0.2, -chain[i,2]), max = min(0.2, 1 - chain[i,2]))
quotient <- joint(proposal_r, proposal_prob) - joint(chain[i, 1], chain[i, 2])
if (runif(1, 0, 1) < min(1, exp(quotient))) {
chain[i + 1, ] <- c(proposal_r, proposal_prob)
} else {
chain[i + 1, ] <- chain[i, ]
}
}
return(chain)
}
iterations <- 2000
startvalue <- c(4, 0.25)
res <- run_mcmc(startvalue, iterations)
colMeans(res)
#> [1] 3.1009495 0.1988177

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}

arima.sim() function with varying: sample sizes, phi values and sd values

I want to simulate ARIMA(1,1,0) with varying:
sample sizes
phi values
standard deviation values.
I admire how the bellow r code is simulating just one ARIMA(1,1,0) which I want to follow the format to simulate many ARIMA(1,1,0) with varying sample sizes, phi values and standard deviation values
wn <- rnorm(10, mean = 0, sd = 1)
ar <- wn[1:2]
for (i in 3:10){
ar<- arima.sim(n=10,model=list(ar=-0.7048,order=c(1,1,0)),start.innov=4.1,n.start=1,innov=wn)
}
I have asked a similar question here and given a good answer based on my question, but now I see that arima.sim() function is indispensable in simulating ARIMA time series and therefore want to incorporate it into my style of simulating ARIMA time series.
I come up with this trial that uses arima.sim() function to simulate N=c(15, 20) ARIMA(1,1,0) time series with varying sample sizes, standard deviation values and phi values by first generating N random number and then using the initial two random number to be the first two ARIMA(1,1,0). The 3rd to **n**th are the made to followARIMA(1,1,0)`.
Here is what I have tried bellow:
N <- c(15L, 20L)
SD = c(1, 2) ^ 2
phi = c(0.2, 0.4)
res <- vector('list', length(N))
names(res) <- paste('N', N, sep = '_')
set.seed(123L)
for (i in seq_along(N)){
res[[i]] <- vector('list', length(SD))
names(res[[i]]) <- paste('SD', SD, sep = '_')
ma <- matrix(NA_real_, nrow = N[i], ncol = length(phi))
for (j in seq_along(SD)){
wn <- rnorm(N[i], mean = 0, sd = SD[j])
ar[[1:2, ]] <- wn[[1:2]]
for (k in 3:N[i]){
ar[k, ] <- arima.sim(n=N[[i]],model=list(ar=phi[[k]],order=c(1,1,0)),start.innov=4.1,n.start=1,innov=wn)
}
colnames(ar) <- paste('ar_theta', phi, sep = '_')
res[[i]][[j]] <- ar
}
}
res1 <- lapply(res, function(dat) do.call(cbind, dat))
sapply(names(res1), function(nm) write.csv(res1[[nm]],
file = paste0(nm, ".csv"), row.names = FALSE, quote = FALSE))
The last two lines write the time series data in .csv and save it in my working directory.
Here may be a method using Map. Please edit your post to include expected output if this does not meet your requirements.
N <- c(15L, 20L)
SD <- c(1, 2) ^ 2
phi = c(0.2, 0.4)
## generate all combos
all_combos <- expand.grid(N = N, SD = SD, phi = phi)
## create function
fx_arima <- function(n, SD, phi) {
arima.sim(n = n,
model=list(ar=phi, order = c(1, 1, 0)),
start.innov = 4.1,
n.start = 1,
rand.gen = function(n) rnorm(n, mean = 0, sd = SD))[-1L]
}
## find arima for all combos using Map
set.seed(123L)
res = Map(fx_arima, all_combos[["N"]], all_combos[["SD"]], all_combos[["phi"]])
## or a little bit more work:
set.seed(123L)
res2 = by(all_combos, all_combos["N"],
function(DF) {
res = mapply(fx_arima, DF[["N"]], DF[["SD"]], DF[["phi"]])
colnames(res) = paste("SD", DF[["SD"]], "phi", DF[["phi"]], sep = "_")
res
})
res2
## write to csv
Map(function(file, DF) write.csv(DF, paste0("N_", file, ".csv")), names(res2), res2)

Simulation for Confidence interval in R

I have an R function that provides the 95% confidence Interval for the ncp (non-centrality parameter) of a t distribution.
Via simulation in R, is it possible to show that in the long-run the CIs from this R function capture a given TRUE ncp (here "2" same as input t) 95% of the time?
(I appreciate any ideas as to how to do this)
CI.ncp <- function(t, N){
f <- function (ncp, alpha, q, df) {
abs(suppressWarnings(pt(q = t, df = N - 1, ncp, lower.tail = FALSE)) - alpha) }
sapply(c(0.025, 0.975),
function(x) optim(1, f, alpha = x, q = t, df = N - 1, control = list(reltol = (.Machine$double.eps)))[[1]]) }
#Example of Use:
CI.ncp(t = 2, N = 20) # gives: -0.08293755 4.03548862
#(in the long-run 95% of the time, "2" is contained within these
# two numbers, how to show this in R?)
Here is what I have tried with no success:
fun <- function(t = 2, N = 20){
ncp = rt(1, N - 1, t)
CI.ncp(t = 2, N = 20)
mean(ncp <= 2 & 2 <= ncp )
}
R <- 1000
sim <- t(replicate(R, fun()))
coverage <- mean(sim[,1] <= 2 & 2 <= sim[,2])
The problem is the that we need to feed the random ncp obtained from the fun in the CI.ncp:
fun <- function(t = 2, N = 20){ ;
ncp = rt(1, N - 1, t);
CI.ncp(t = ncp, N = 20);
}
R <- 1e4 ;
sim <- t(replicate(R, fun()));
coverage <- mean(sim[,1] <= 2 & 2 <= sim[,2])
I would use package MBESS.
#install.packages("MBESS")
library(MBESS)
fun <- function(t = 2, N = 20, alpha = 0.95){
x = rt(1, N - 1, t)
conf.limits.nct(x, df = N, conf.level = alpha)[c(1, 3)]
}
set.seed(5221)
R <- 1000
sim <- t(replicate(R, fun()))
head(sim)
coverage <- mean(sim[,1] <= 2 & 2 <= sim[,2])
coverage
[1] 0.941

Resources