I built a shiny web APP with a nice interface to calculate sample size needed to differentiate between test groups. I wanted to build a simulation that visualizes how power works, my understanding is that power is the probability of differentiating test groups from each other when in fact these are different. Using the built in power calculator I find that I need 10 replicates to distinguish 2 samples groups when %CV =15, %Improvement = 20 and power = 80%. When I try to simulate this using random number generators I on average get a power of 40-55%. See the code below.
cv <- 15 #coefficient of variance is the standard deviation divided by the mean
percent_increase <- 20 #percent increase to detect
mean1 <- 40
mean2 <- mean1 + (mean1*(percent_increase/100))
sd1 <- (cv*mean1)/100
sd2 <- (cv*mean2)/100
pooled_sd <- sqrt((sd1^2 + sd2^2)/2)
difference <- (percent_increase/100)*mean1
pwrt <- power.t.test(delta=difference,sd=sd1,power=0.8,sig.level = .05, type="two.sample", alternative = "two.sided")
print(paste("Number of replicates needed is", pwrt$n))
#Simulate sample populations and tag which samples are different from each other.
record_test <- c()
for(i in 1:1000){
rep_sequence <- seq(2,50, by=4)
data_list <- list() # empty data list
for(r in 1:length(rep_sequence)){
d <- rnorm(rep_sequence[r], mean = mean1, sd = pooled_sd)
d2 <- rnorm(rep_sequence[r], mean = mean2, sd = pooled_sd)
df <- data.frame("value"=d, "sample"="Sample 1")
df2 <- data.frame("value"=d2, "sample"="Sample 2")
df3 <- rbind(df, df2)
df3$rep_n <- rep_sequence[r]
data_list[[r]] <- df3
}
all_data <- do.call(rbind, data_list)
all_data_summ <- all_data %>%
group_by(sample, rep_n) %>%
summarise(N= n(),
mean = mean(value),
sd = sd(value)
) %>%
ungroup() %>%
mutate(se = sd / sqrt(N),
ci_lower = mean - qnorm(0.975)*se,
ci_upper = mean + qnorm(0.975)*se,
#ci_lower = mean - qt(1 - (0.05/2), N -1)* se,
#ci_upper = mean + qt(1 - (0.05/2), N -1)* se,
)
different_tag <- (all_data_summ %>% filter(sample == 'Sample 1') %>% select(N, ci_upper) ) %>% #Sample 1 set
left_join(all_data_summ %>% filter(sample == 'Sample 2') %>% select(N, ci_lower), by=c("N"="N")) %>% #Sample 2 set
rename(ci_upper_s1 = ci_upper, ci_lower_s2 = ci_lower) %>%
mutate(different = ifelse(ci_lower_s2 > ci_upper_s1 ,'yes', 'no'))
all_data_summ1 <- all_data_summ %>%
left_join(different_tag %>% select(N, different), by=c("N"="N"))
replicate_n <- 10 #at n=10 the power should be 80%
test_result <- all_data_summ1[all_data_summ1$rep_n == replicate_n, ]$different[[1]]
record_test <- c(record_test, test_result)
ggplot(all_data_summ1, aes(rep_n, mean)) +
geom_errorbar(aes(x=rep_n, ymin=ci_lower, ymax= ci_upper, group=sample),position=position_dodge(width=1.5), width=1.5,size=1, colour="red") +
geom_point(data= filter(all_data_summ1, different == "yes"), colour="black", size=8,stroke=2, aes(rep_n,mean,group=sample),position=position_dodge(width=1.5), shape=0) +
geom_point(position=position_dodge(width=1.5), size=4, pch=21, aes(fill=sample)) +
scale_x_continuous(breaks = rep_sequence) +
labs(x="Replication", y="Average", title= paste0('Sample Means with 95% Confidence Interval Bars at CV=',cv, '%')) +
theme_gray(base_size = 25) +
theme(plot.title = element_text(hjust = 0.5))
}
print(table(record_test)/length(record_test))
Related
I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))
set.seed(666999)
PLOT <- function(n){
n_samples<-10
#Creating matrix to store generated data
data_matrix<-matrix(ncol =n_samples, nrow= n)
for(j in 1:n_samples){
#Generating 10 standard normal samples of size n
data_matrix[ , j]<-rnorm(n=n, mean = 0, sd = 1)
}
for(k in 1:n_samples){
sam_20<-data_matrix[ , k]
# extracting each of the kth samples
#Ploting
Q_Qplot<-qqnorm( sam_20 )
Q_QplotL<-qqline( sam_20 )
#Q_Qplot<-gg_qqplot(sam_20, ylab="Sample Quantiles",
#xlab = "Theoritical Quantiles",
#main= bquote("Q-Q plot for sample size of "
# ~ n == ~ .(n)))#quantile-quantile plot
}
# return(n)
#return(Q_Qplot)
#return(Q_QplotL)
}
#layout_matrix_1 <- matrix(1:10, ncol = 5) # Define position matrix
#layout(layout_matrix_1)
PLOT(100)
PLOT(50)
PLOT(20)
PLOT(10)
For each sample size, I am generating ten plots. I want the layout for each of my 10 plots to fit well on a page with reasonable height and width. For some reason, I can't get it to work the way I wanted. I need some assistance perhaps with ggplot2 which has a better aesthetic view. Thanks
An approach with tidyverse
library(tidyverse)
#Values to simulate
sample_size <- c(10,20,50,100)
sample_num <- 1:10
#Create data.frame
expand_grid(sample_size,sample_num) %>%
#Map values to simulate cenarios
mutate(y = map(sample_size,sim_data)) %>%
unnest(cols = c(y)) %>%
ggplot(aes(sample = y))+
#qqplot
stat_qq()+
#qqline with color blue
stat_qq_line(col = "blue")+
#facet sample_size x sample_num
facet_grid(cols = vars(sample_size),rows = vars(sample_num))
I was wondering why variable mean_y is not recognized by my
geom_hline(yintercept = unique(mean_y)) call?
library(tidyverse)
set.seed(20)
n_groups <- 2
n_in_group <- 20
sd_e = 2
groups <- gl(n_groups, n_in_group, labels = c("T","C"))
age <-rnorm(length(groups), 25, 3)
betas <- c(5,0,0,2)
dat <- data.frame(groups=groups,age=age)
X <- model.matrix(~ groups * age, data = dat)
lin_pred <- as.vector(X %*% betas)
dat$y <- rnorm(nrow(X), lin_pred, sd_e)
dat %>% group_by(groups) %>% mutate(mean_y = mean(y)) %>%
ungroup() %>%
ggplot()+aes(x = age, y = y) +
geom_point(aes(color=groups)) +
geom_hline(yintercept = unique(mean_y)) # Error in unique(mean_y) :
# object 'mean_y' not found
Variables need to be inside aes(), try:
geom_hline(aes(yintercept = mean_y))
I would like to calculate the percentiles of the following tibble...
I have a non-zero subset of 10 observations in each of 3 variables i.e...
n <- 10
tibb <- tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y)
(The excluded observations are all zero)
Therefore the mean is the sum of the fields / 10 (as opposed to / 5):
meantibb <- tibb %>% group_by() %>%
summarise_if(is.numeric, sum, na.rm = TRUE) / n
meantibb
How do I get the following percentiles of x, y and z in the tibble please?
perciles <- c(0.5, 0.75)
percentiles <- function(p) quantile(p, perciles)
Thank you
You could create a data set including the zeroes
missingRowCount <- n - nrow(tibb)
colCount <- ncol(tibb)
zeroTibb <- matrix(rep(0, missingRowCount * colCount), ncol = colCount, nrow = missingRowCount) %>% as.tibble()
colnames(zeroTibb) <- colnames(tibb)
allTibb <- dplyr::bind_rows(tibb, zeroTibb)
Once you have the full data you can run the following to get a tibble of percentiles
percTibble = sapply(allTibb, percentiles) %>%
as.tibble()
The assumption here is that the data is not going to be too large when the zeroes are included.
You're close, your method of creating the mean (and subsequently the percentiles) could be simpler if you use gather first and then group the data by the three different factors.
library(dplyr)
n <- 10
tibb <- tibble(x = 1:5, y = 1, z = x ^ 2 + y)
tibb %>%
gather("fctr", "value") %>%
group_by(fctr) %>%
summarise(mean = sum(value) / n,
perc_50 = quantile(value, 0.5),
perc_75 = quantile(value, 0.75))
However, I'm not sure if you want the percentile of the non-zero subset or the entire dataset, because this will change your outcomes, i.e.
> x = 1:5
> quantile(x, 0.1)
10%
1.4
> test <- c(0,0,0,0,0,1,2,3,4,5)
> quantile(test, 0.1)
10%
0
Using the dataset Lahman::Batting I've estimated parameters for the beta distribution. Now I want to plot this empirically derived beta distribution onto the histogram that I estimated it from.
library(dplyr)
library(tidyr)
library(Lahman)
career <- Batting %>%
filter(AB > 0) %>%
anti_join(Pitching, by = "playerID") %>%
group_by(playerID) %>%
summarize(H = sum(H), AB = sum(AB)) %>%
mutate(average = H / AB)
I can plot the distribution of RBI as:
career %>%
filter(AB > 500) %>%
ggplot(aes(x = average)) +
geom_histogram() +
geom_freqpoly(color = "red")
And obtain:
I know I can use + geom_freqpoly to obtain:
But I want the smooth beta distribution. I can estimate beta parameters by:
career_filtered <- career %>%
filter(AB >= 500)
m <- MASS::fitdistr(career_filtered$average, dbeta,
start = list(shape1 = 1, shape2 = 10))
alpha0 <- m$estimate[1] # parameter 1
beta0 <- m$estimate[2] # parameter 2
Now that I have parameters alpha0 and beta0, how do I plot the beta distribution so that I obtain something like this:
This question is based on a post I'm reading here.
All code, including the code for the plots, can be found here. The following code is used to get the requested plot:
ggplot(career_filtered) +
geom_histogram(aes(average, y = ..density..), binwidth = .005) +
stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red",
size = 1) +
xlab("Batting average")
Hope this helps.