Bootstrap correlation CI with increasing sample size - r

I would like to demonstrate how the width of a 95% confidence interval around a correlation changes with increasing sample size, from n = 10 to n = 100 in increments of 5 samples per round. I assume a bootstrap function can do this, replicating each round 1000 times. How can this be done in R?
See:
http://www.nicebread.de/at-what-sample-size-do-correlations-stabilize/
We can use the diamonds data:
library(ggplot2)  # the diamonds dataset ships with ggplot2
data(diamonds)
x <- diamonds$price
y <- diamonds$carat

You can add the chart and axis titles yourself, but this code does what I think you're looking for using ggplot2 and the 'psychometric' package:
library(ggplot2)
library(psychometric)
corSamp <- function(x) {
  # Return the correlation between price and carat on diamonds
  # for a given sample size x
  index <- sample(1:nrow(diamonds), x)
  carat <- diamonds$carat[index]
  price <- diamonds$price[index]
  return(cor(carat, price))
}
sizes <- seq(5, 100, 5)
cors <- sapply(sizes, corSamp)
lower <- sapply(seq_along(sizes), function(i) CIr(r = cors[i], n = sizes[i], level = 0.95)[1])
upper <- sapply(seq_along(sizes), function(i) CIr(r = cors[i], n = sizes[i], level = 0.95)[2])
myData <- data.frame(cors, lower, upper, size = sizes)
myPlot <- ggplot(myData, aes(x = size, y = cors)) +
  geom_line() +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.5)
Here size is the sample-size column.

You can loop through your sample sizes with sapply, and for each sample size draw 1000 random samples of the appropriate size, reporting the average width of the confidence interval:
set.seed(144)
ci.widths <- sapply(seq(10, 100, 5), function(x) mean(replicate(1000, {
  r <- sample(nrow(diamonds), x, replace = TRUE)
  diff(cor.test(diamonds$price[r], diamonds$carat[r])$conf.int)
})))
plot(seq(10, 100, 5), ci.widths, xlab = "Sample size", ylab = "CI width")
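Note that cor.test gives a Fisher-transformation interval rather than a bootstrap percentile interval. If you want the CI itself to come from resampling, here is a minimal sketch using the boot package (boot_ci_width is my own helper name, not part of the answers above):

library(boot)

# Sketch: percentile bootstrap CI width for the correlation at one sample size
boot_ci_width <- function(n, R = 1000) {
  d <- diamonds[sample(nrow(diamonds), n), c("price", "carat")]
  b <- boot(d, function(data, i) cor(data$price[i], data$carat[i]), R = R)
  ci <- boot.ci(b, type = "perc")$percent
  ci[5] - ci[4]  # upper percentile bound minus lower
}

# e.g. average width over a few independent draws at n = 50
mean(replicate(20, boot_ci_width(50)))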

Related

How do I get a smooth curve from a few data points, in R?

I am trying to plot the rate 1/t as it changes with mue. The code is given below and I have highlighted the relevant lines with input and output.
library("deSolve")
library("reshape")
library("tidyverse")
Fd <- data.frame()
MUES <- c(100, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 100010, 100020, 100050, 100060, 100080, 100090, 100100, 100500) # <------ THIS IS THE INPUT
for (i in 1:length(MUES)) {
  parameters <- c(tau = 0.005, tau_r = 0.0025, mui = 0, Ve = 0.06, Vi = -0.01, s = 0.015, mue = MUES[i])
  state <- c(X = 0.015, Y = 0)
  Derivatives <- function(t, state, parameters) {
    # cc <- signal(t)
    with(as.list(c(state, parameters)), {
      # rate of change
      dX <- -(1/tau + mue - mui)*X + (Y - X)/tau_r + mue*Ve - mui*Vi
      dY <- -Y/tau + (X - Y)/tau_r
      # return the rate of change
      list(c(dX, dY))
    }) # end with(as.list ...
  }
  times <- seq(0, 0.1, by = 0.0001)
  out <- ode(y = state, times = times, func = Derivatives, parms = parameters)
  out.1 <- out %>%
    as.data.frame() %>%
    summarise(d = min(times[Y >= 0.015]))
  Time <- out.1$d
  localdf <- data.frame(t = Time, rate = 1/Time, input = MUES[i])
  Fd <- rbind.data.frame(Fd, localdf)
} # <----- Fd IS THE DATAFRAME WITH OUTPUT AND INPUT
spline_int <- as.data.frame(spline(Fd$input, Fd$rate))
ggplot(Fd) +
  geom_point(aes(x = input, y = rate), size = 3) +
  geom_line(data = spline_int, aes(x = x, y = y))
The rate 1/t has a limiting value at 1276, and that's why I have taken quite a few values of mue at the end, to highlight this. I get a graph like this:
What I want is something like below, so I can highlight the fact that the rate 1/t doesn't grow to infinity and in fact has a limiting value. The figure below is from the Python question.
How do I accomplish this in R? I have tried loess, splines, and geom_smooth (but only by changing span); perhaps I am missing something obvious.
Splines are piecewise polynomials with multiple inflection points, so they will follow your data rather than level off. It sounds like you instead want to fit a logarithmic curve:
# fit a logarithmic curve to your data
logEstimate <- lm(rate ~ log(input), data = Fd)
# create a series of x values for which to predict y
# (starting at the smallest observed input, since log(0) is -Inf)
xvec <- seq(min(Fd$input), max(Fd$input), length = 1000)
# predict y based on the log curve fitted to your data
logpred <- predict(logEstimate, newdata = data.frame(input = xvec))
# save the result in a data frame; these values will be used to plot the log curve
pred <- data.frame(x = xvec, y = logpred)
ggplot() +
  geom_point(data = Fd, size = 3, aes(x = input, y = rate)) +
  geom_line(data = pred, aes(x = x, y = y))
Result:
I borrowed some of the code from this answer.
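A logarithmic curve still grows without bound, though. Since you know the rate approaches a limit, an asymptotic model may depict the plateau better. Here is a minimal sketch using the self-starting SSasymp model from base R's stats package (not the original answer's method; whether it converges depends on the shape of your data):

# Asymptotic fit: rate approaches the Asym parameter as input grows
asymEstimate <- nls(rate ~ SSasymp(input, Asym, R0, lrc), data = Fd)
asympred <- predict(asymEstimate, newdata = data.frame(input = xvec))
pred_asym <- data.frame(x = xvec, y = asympred)
ggplot() +
  geom_point(data = Fd, size = 3, aes(x = input, y = rate)) +
  geom_line(data = pred_asym, aes(x = x, y = y)) +
  geom_hline(yintercept = coef(asymEstimate)["Asym"], linetype = "dashed")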

Plotting from a for loop

set.seed(666999)
PLOT <- function(n) {
  n_samples <- 10
  # Create a matrix to store the generated data
  data_matrix <- matrix(ncol = n_samples, nrow = n)
  for (j in 1:n_samples) {
    # Generate 10 standard normal samples of size n
    data_matrix[, j] <- rnorm(n = n, mean = 0, sd = 1)
  }
  for (k in 1:n_samples) {
    # Extract the kth sample
    sam_20 <- data_matrix[, k]
    # Plotting
    Q_Qplot <- qqnorm(sam_20)
    Q_QplotL <- qqline(sam_20)
    # Q_Qplot <- gg_qqplot(sam_20, ylab = "Sample Quantiles",
    #                      xlab = "Theoretical Quantiles",
    #                      main = bquote("Q-Q plot for sample size of "
    #                                    ~ n == ~ .(n))) # quantile-quantile plot
  }
  # return(n)
  # return(Q_Qplot)
  # return(Q_QplotL)
}
# layout_matrix_1 <- matrix(1:10, ncol = 5) # Define position matrix
# layout(layout_matrix_1)
PLOT(100)
PLOT(50)
PLOT(20)
PLOT(10)
For each sample size, I am generating ten plots. I want the layout of my 10 plots to fit well on a page with reasonable height and width. For some reason, I can't get it to work the way I want. I need some assistance, perhaps with ggplot2, which has better aesthetics. Thanks
An approach with tidyverse (the original answer assumed a sim_data helper; it is defined explicitly here so the code runs):
library(tidyverse)
# Helper to simulate one standard normal sample of size n
sim_data <- function(n) rnorm(n, mean = 0, sd = 1)
# Values to simulate
sample_size <- c(10, 20, 50, 100)
sample_num <- 1:10
# Create a data.frame of all combinations
expand_grid(sample_size, sample_num) %>%
  # Map values to simulated scenarios
  mutate(y = map(sample_size, sim_data)) %>%
  unnest(cols = c(y)) %>%
  ggplot(aes(sample = y)) +
  # qqplot
  stat_qq() +
  # qqline in blue
  stat_qq_line(col = "blue") +
  # facet sample_size x sample_num
  facet_grid(cols = vars(sample_size), rows = vars(sample_num))
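If you would rather stay with the base-graphics PLOT() function from the question, the page layout can be set before calling it (a minimal sketch, assuming ten plots per page):

# Arrange the ten Q-Q plots from one PLOT() call in a 2 x 5 grid
par(mfrow = c(2, 5), mar = c(4, 4, 2, 1))
PLOT(100)
par(mfrow = c(1, 1))  # reset the layout afterwards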

How to estimate the density (empirical pdf) from quantiles (the empirical CDF) in R

Question
Say I have an unknown density a.
All I know is a probability grid (probs) of quantiles (quants).
How can I generate random samples from the unknown density?
This is what I have so far.
I am giving rejection sampling a try, but I am not tied to this method. Here I fit a polynomial (degree 6) to the quantiles. The purpose is to convert discrete quantiles into a smooth continuous function, which gives me an empirical CDF. I then use rejection sampling to get actual samples from the CDF. Is there a convenient way in R to convert samples from the CDF to density samples, or did I go about this in a convoluted way when there is a better alternative?
library(tibble)
library(dplyr)
library(ggplot2)

# unknown and probably not normal, but I use rnorm here because it is easy
a <- c(exp(rnorm(200, 5, .8)))
cum_probs <- seq(0.05, 0.95, 0.05)
quants <- quantile(a, cum_probs)
df_quants <- tibble::tibble(cum_probs, quants)
# Fit a degree-6 polynomial to the quantiles
fit <- lm(quants ~ poly(cum_probs, 6), df_quants)
df_quants$fit <- predict(fit, df_quants)
p <- df_quants %>%
  ggplot(aes(x = cum_probs, y = quants)) +
  geom_line(aes(y = quants), color = "black", size = 1) +
  geom_line(aes(y = fit), color = "red", size = 1)
CDF
count <- 1
accept <- c()
X <- runif(50000, 0, 1)
U <- runif(50000, 0, 1)

# Evaluate the fitted polynomial at probability x
estimate <- function(x) {
  predict(fit, data.frame(cum_probs = c(x)))
}

# Rejection sampling loop
while (count <= 50000 & length(accept) < 40000) {
  test_u <- U[count]
  test_x <- estimate(X[count]) / (1000 * dunif(X[count], 0, 1))
  if (test_u <= test_x) {
    accept <- rbind(accept, X[count])
  }
  count <- count + 1
}

p2 <- tibble(V1 = as.vector(accept)) %>%
  ggplot(aes(x = V1)) +
  geom_histogram(bins = 45)
CDF Samples
I don't think rejection sampling is needed. With a B-spline fit I was able to generate sensible samples via inverse transform sampling, but I also needed a higher-resolution grid. The tails are a little off.
The assumption I am making here is that a B-spline fit to a tight grid of quantiles approximates the inverse CDF. Once this curve is fit, I can just feed it random uniforms U[0,1]:
library(splines2)
library(tibble)
library(dplyr)
library(ggplot2)

a <- c(exp(rnorm(200, 5, .8)))
cum_probs <- seq(0.01, 0.99, 0.01)
quants <- quantile(a, cum_probs)
df_quants <- tibble::tibble(cum_probs, quants)
# Fit a B-spline to the quantile function (the inverse CDF)
fit_spline <- lm(quants ~ bSpline(cum_probs, df = 9), df_quants)
df_quants$fit_spline <- predict(fit_spline, df_quants)

# Evaluate the fitted inverse CDF at probability x
estimate <- function(x) {
  predict(fit_spline, data.frame(cum_probs = c(x)))
}

# Inverse transform: push uniforms through the fitted quantile function
e <- runif(10000, 0, 1)
y <- estimate(e)
df_density <- tibble(y)
df_densitya <- tibble(a)
py <- df_density %>%
  ggplot(aes(x = y)) +
  geom_histogram()
pa <- df_densitya %>%
  ggplot(aes(x = a)) +
  geom_histogram(bins = 45)
Original density vs. inverse-transform samples, summary stats:
original dist a
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  20.36   80.84  145.25  195.72  241.22 1285.24
generated from quantiles y
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
  28.09   81.78  149.53  189.07  239.62  667.27
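If fitting a curve at all feels heavy, a monotone interpolation of the quantile grid can serve directly as the inverse CDF. A minimal sketch using base R's approxfun (inv_cdf is my own name; rule = 2 clamps evaluation outside the grid, which is my assumption about how to handle the tails):

# Interpolate the empirical quantile function and sample by inverse transform
inv_cdf <- approxfun(cum_probs, quants, rule = 2)
y2 <- inv_cdf(runif(10000))
# A kernel density estimate of the samples approximates the unknown pdf
plot(density(y2))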

Plotting a graph with sample sizes and power estimates

I have simulated a linear model 1000 times using randomly generated height and weight values, and randomly assigned each participant to a treatment or non-treatment group (a factor of 1 or 0). Let's say the model was:
lm(bmi~height + weight + treatment, data = df)
I am now struggling with the following:
The model now needs to cycle through sample sizes from 300 to 500 in steps of 10, with 1000 replications at each size, and store the proportion of simulated experiments with p-values below 0.05, in order to estimate the power to detect a change of 0.5 in bmi between the two treatment groups at the 5% significance level.
After doing the above, I then need to create a figure that depicts sample size on the x-axis and estimated power on the y-axis, and marks the smallest sample size achieving an 80% power estimate in a distinct color.
Any ideas how and where to go from here?
Thanks,
Chris
I would do it something like this:
library(dplyr)
library(ggplot2)

# first, encapsulate the steps required to generate one sample of data
# at a given sample size, run the model, and extract the treatment p-value
do_simulate <- function(n) {
  # use assumed data generating process to simulate data and add error
  data <- tibble(height = rnorm(n, 69, 0.1),
                 weight = rnorm(n, 197.8, 1.9),
                 treatment = sample(c(0, 1), n, replace = TRUE),
                 error = rnorm(n, sd = 1.75),
                 bmi = 703 * weight / height^2 + 0.5 * treatment + error)
  # model the data
  mdl <- lm(bmi ~ height + weight + treatment, data = data)
  # extract p-value for treatment effect
  summary(mdl)[["coefficients"]]["treatment", "Pr(>|t|)"]
}
# second, wrap that single simulation in a replicate so that you can perform
# many simulations at a given sample size and estimate power as the proportion
# of simulations that achieve a significant p-value
simulate_power <- function(n, alpha = 0.05, r = 1000) {
  p_values <- replicate(r, do_simulate(n))
  power <- mean(p_values < alpha)
  return(c(n, power))
}
# third, estimate power at each of your desired
# sample sizes and restructure that data for ggplot
mx <- vapply(seq(300, 500, 10), simulate_power, numeric(2))
plot_data <- tibble(n = mx[1, ], power = mx[2, ])
# fourth, make a note of the minimum sample size to achieve your desired power
plot_data %>%
  filter(power > 0.80) %>%
  top_n(-1, n) %>%
  pull(n) -> min_n
# finally, construct the plot
ggplot(plot_data, aes(x = n, y = power)) +
  geom_smooth(method = "loess", se = FALSE) +
  geom_vline(xintercept = min_n)
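The geom_vline marks min_n with a line; if you want the point itself in a distinct color, as the question asks, one way is a sketch building on the plot_data and min_n objects above:

# Highlight the smallest sample size that reaches 80% power in red
ggplot(plot_data, aes(x = n, y = power)) +
  geom_point() +
  geom_point(data = filter(plot_data, n == min_n), color = "red", size = 3) +
  geom_hline(yintercept = 0.80, linetype = "dashed")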

How to plot vector of bootstrapped slopes in ggplot2?

I've been using ggplot2 to plot the results of bootstrapping various statistical outputs such as correlation coefficients. Most recently, I bootstrapped the slope of a linear regression model. Here's how that looks using the plot() function from the graphics package:
plot(waiting ~ eruptions,
     data = faithful,
     main = "Relationship Between Eruption Length and Wait Time at\nOld Faithful With Bootstrapped Regression Lines",
     xlab = "Eruption Length (minutes)",
     ylab = "Wait Time (minutes)",
     col = spot_color,  # spot_color is a color the OP defined elsewhere
     pch = 19)
index <- 1:nrow(faithful)
for (i in 1:10000) {
  index_boot <- sample(index, replace = TRUE) # get a bootstrap sample (of indices)
  faithful_boot <- faithful[index_boot, ]
  # Fit the linear model to the bootstrapped data:
  fit.boot <- lm(waiting ~ eruptions, data = faithful_boot)
  abline(fit.boot, lwd = 0.1, col = rgb(0, 0.1, 0.25, alpha = 0.05)) # Add line to plot
}
fit <- lm(waiting ~ eruptions, data = faithful)
abline(fit, lwd = 2.5, col = "blue")
That works, but it depends on a workflow where we first create a plot, then add the lines in a loop. I'd rather create a list of slopes with a function and then plot all of them in ggplot2.
For example, the function might look something like this:
set.seed(777)        # included so the following output is reproducible
n_resample <- 10000  # set the number of times to resample the data

# First argument is the data; second is the number of resampled datasets
bootstrap <- function(df, n_resample) {
  slope_resample <- matrix(NA, nrow = n_resample)  # initialize vector of slopes
  index <- 1:nrow(df)                              # create an index for the supplied table
  for (i in 1:n_resample) {
    index_boot <- sample(index, replace = TRUE)    # sample row numbers, with replacement
    df_boot <- df[index_boot, ]                    # create a bootstrap sample from original data
    a <- lm(waiting ~ eruptions, data = df_boot)   # fit the linear model
    slope_resample[i] <- a$coefficients[2]         # take the slope
  }
  return(slope_resample)  # return the vector of bootstrapped slopes
}
bootstrapped_slopes <- bootstrap(faithful, 10000)
But how to get geom_line() or geom_smooth() to take the data from bootstrapped_slopes? Any assistance is much appreciated.
EDIT: More direct adaptation from the OP
For plotting, I presume you want both the slopes and the intercepts, so here's a modified bootstrap function:
bootstrap <- function(df, n_resample) {
  # Note 2 columns here, one for the intercept and one for the slope
  coef_resample <- matrix(NA, nrow = n_resample, ncol = 2)
  index <- 1:nrow(df)  # create an index for the supplied table
  for (i in 1:n_resample) {
    index_boot <- sample(index, replace = TRUE)  # sample row numbers, with replacement
    df_boot <- df[index_boot, ]                  # create a bootstrap sample from original data
    a <- lm(waiting ~ eruptions, data = df_boot) # fit the linear model
    coef_resample[i, 1] <- a$coefficients[1]     # take the intercept (V1)
    coef_resample[i, 2] <- a$coefficients[2]     # take the slope (V2)
  }
  # Return a data frame with all the intercepts and slopes
  return(as.data.frame(coef_resample))
}
Then run it and plot the lines from that data frame:
bootstrapped_slopes <- bootstrap(faithful, 10000)

library(dplyr); library(ggplot2)
ggplot(faithful, aes(eruptions, waiting)) +
  geom_abline(data = bootstrapped_slopes %>%
                sample_n(1000),  # 10k lines look about the same as 1k, just darker and slower
              aes(slope = V2, intercept = V1),
              alpha = 0.01) +
  geom_point(shape = 19, color = "red")
Alternative solution
This could also be done using modelr and broom to simplify some of the bootstrapping. Based on the main help example for modelr::bootstrap, we can do the following:
library(purrr); library(modelr); library(broom); library(dplyr); library(tidyr)
set.seed(777)
# Create a bootstrap object with 10k resamples of faithful
boot <- modelr::bootstrap(faithful, 10000)
# Apply the linear regression to each resample
models <- map(boot$strap, ~ lm(waiting ~ eruptions, data = .))
# Extract the model results into a tidy format
tidied <- map_df(models, broom::tidy, .id = "id")
# We just need the slope and intercept here
tidied_wide <- tidied %>% select(id, term, estimate) %>% spread(term, estimate)
ggplot(faithful, aes(eruptions, waiting)) +
  geom_abline(data = tidied_wide %>%
                sample_n(1000),  # 10k lines look about the same as 1k, just darker and slower
              aes(slope = eruptions, intercept = `(Intercept)`, group = id),
              alpha = 0.05) +
  geom_point(shape = 19, color = "red")  # spot_color wasn't provided in OP
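A small note if you run this with current tidyr: spread() is superseded, and the equivalent reshape (my substitution, same result) would be:

# Equivalent reshaping with pivot_wider instead of the superseded spread()
tidied_wide <- tidied %>%
  select(id, term, estimate) %>%
  pivot_wider(names_from = term, values_from = estimate)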
