I need to plot the statistical power vs. the number of replicates and in this case the number of replicates (n) is 3, but I can't figure out how to plot it.
This is what I have:
library(car)

# Simulation settings: replicates per group and number of simulated experiments.
n <- 3
nsims <- 1000

# Preallocate the result vectors instead of growing them inside the loop.
p <- numeric(nsims)
coef <- numeric(nsims)

for (i in seq_len(nsims)) {
  # One simulated experiment: n treated and n control observations.
  treat <- rnorm(n, mean = 460, sd = 110)
  cont <- rnorm(n, mean = 415, sd = 110)
  df <- data.frame(
    y = c(treat, cont),
    x = rep(c("treat", "cont"), each = n)
  )
  model <- glm(y ~ x, data = df)
  # Name the p-value column in full instead of relying on `$P` partial matching.
  p[i] <- Anova(model)[["Pr(>Chisq)"]]
  # Second coefficient = treatment effect ("cont" sorts first, so it is the
  # reference level).
  coef[i] <- coef(model)[2]
}

hist(p, col = "skyblue")
# Estimated power: share of simulated experiments with p < 0.05.
mean(p < 0.05)
Can someone help me plot this?
Also, I need to calculate the mean of the coefficients using only models where p < 0.05. This simulates the following process: if you perform the experiment and p > 0.05, you report 'no effect', but if p < 0.05 you report 'significant effect'. But I'm not sure how to set that up from what I have.
Would I just do this?
mean(coef)
But I don't know how to include only those with p < 0.05.
Thank you!
Disclaimer: I spend a decent amount of time simulating experiments for work so I have strong opinions on this.
If that's everything because it's for a study assignment, then fine; if you are planning to go further with this, I recommend
adding the tidyverse to your arsenal.
Encapsulating functionality
First, let me put a single iteration into a function, to decouple the simulation logic from the subsetting of the results (the encapsulation).
# Simulate one two-group experiment and test for a treatment effect.
#
# Args:
#   n: number of replicates drawn per group.
#
# Returns:
#   A one-row data.frame with columns `n`, `p` (p-value of the treatment term
#   from car::Anova) and `coef` (second model coefficient, i.e. the estimated
#   treat-minus-cont difference).
sim <- function(n) {
  treat <- rnorm(n, 460, 110)
  cont <- rnorm(n, 415, 110)
  obs <- data.frame(y = c(treat, cont), x = rep(c("treat", "cont"), each = n))
  fit <- glm(y ~ x, data = obs)
  data.frame(
    n,
    # Name the p-value column in full instead of relying on `$P` partial
    # matching.
    p = car::Anova(fit)[["Pr(>Chisq)"]],
    coef = coef(fit)[2]
  )
}
Now we can simulate
nsims <- 1000
sample_sizes <- c(3, 5, 10, 20, 50, 100)

# sim() takes the sample size as a parameter, so run `nsims` simulated
# experiments for every sample size and stack the one-row results.
sims <- do.call(
  rbind,
  lapply(rep(sample_sizes, each = nsims), sim)
)

# Aggregations
# Power: share of simulations with p < 0.05, per sample size.
power_smry <- aggregate(p ~ n, sims, function(x) mean(x < 0.05))
# Mean coefficient among the significant simulations only, per sample size.
coef_smry <- aggregate(coef ~ n, sims[sims$p < 0.05, ], mean)

# Plots
plot(p ~ n, data = power_smry)
If you do this in the tidyverse this is one possible approach
crossing(
  n = c(3, 5, 10, 20, 50, 100)
  # Add any number of other inputs here that you want to explore (like lift).
) %>%
  rowwise() %>%
  # This looks complicated but will be less so if you have multiple
  # varying hyperparameters defined in crossing.
  # lapply() replaces the deprecated purrr::rerun(); the anonymous function
  # is created inside mutate(), so `n` is the current row's sample size.
  mutate(
    results = list(bind_rows(lapply(seq_len(nsims), function(i) sim(n))))
  ) %>%
  pull(results) %>%
  bind_rows() %>%
  group_by(n) %>%
  # The more metrics you want to summarize in different ways the easier
  # compared to base.
  summarize(
    power = mean(p < 0.05),
    coef = mean(coef[p < 0.05])
  )
Related
The code I have so far is written below. This is simulation so there is no actual data. I have two questions:
I have two vectors (treat and cont) but I need to put them into one single vector which I did (vect), however, I need another vector that is coding for treatment vs. control. How do I do that?
For my model (model) I need to fit a linear model testing for a treatment effect but I don't know how to add that effect into what I have or is that what it is testing in the code I have?
library(car)
# Draw one example sample of 3 treated and 3 control values and print each.
treat=rnorm(3, mean = 460, sd = 110)
treat
cont=rnorm(3, mean = 415, sd = 110)
cont
# Stack both groups into a single vector (treated values first).
vect=c(treat, cont)
vect
nsims = 1000
# Both result vectors start empty and are extended by index inside the loop.
p.value.saved = coeff.saved = vector()
for (i in 1:nsims) {
# Fresh draws for every simulated experiment.
treat=rnorm(3, mean = 460, sd = 110)
cont=rnorm(3, mean = 415, sd = 110)
vect=c(treat, cont)
# NOTE(review): this regresses the treated values on the control values;
# `vect` is not used and no treatment/control indicator enters the model.
model = glm(treat ~ cont, family = poisson)
p.value.saved[i] = Anova(model)$P[1]
# NOTE(review): coef(model) has an intercept and a slope here, but it is
# assigned to a single slot -- confirm which coefficient is wanted.
coeff.saved[i] = coef(model)
}
Thank you!
Something like this? (Note that you'll get a bunch of warnings for running a Poisson regression against continuous data.)
n <- 3
nsims <- 10

# Run one simulated experiment and return a one-row data.frame holding the
# treatment p-value and the treatment coefficient. The argument is ignored;
# it only lets the function be used with lapply().
simulate_once <- function(.) {
  treat <- rnorm(n, mean = 460, sd = 110)
  cont <- rnorm(n, mean = 415, sd = 110)
  # Long format: one response column plus a group indicator (instead of vect).
  long <- data.frame(
    y = c(treat, cont),
    x = rep(c("treat", "cont"), each = n)
  )
  # Model the values against the treatment indicator.
  fit <- glm(y ~ x, data = long, family = poisson)
  data.frame(p = car::Anova(fit)$P, coef = coef(fit)[2])
}

# Stack the per-simulation rows into one data.frame.
do.call(rbind, lapply(1:nsims, simulate_once))
The first creates the string and the second bit will combine them. In your example they are both length 3, hence the 3 repetition in rep("trt",3)
# Group labels in the same order as the stacked values below: c(treat, cont)
# puts the treated observations first, so "trt" must come first as well.
treat_lab <- c(rep("trt", 3), rep("control", 3))
# data.frame() keeps the measurements numeric; cbind() with a character
# vector would turn every value into a string.
treatment <- data.frame(treat_lab, value = c(treat, cont))
I want to compare normality tests (the Shapiro-Wilk, Kolmogorov-Smirnov, Anderson-Darling, Cramér-von Mises and Adjusted Jarque-Bera methods) based on the power of the test (1-beta) for sample sizes n = 10, 20, 30, 40 and 50.
# Estimate by simulation the power of five normality tests.
#
# Draws `m` samples of size `n` from Uniform(2, 5), applies the Shapiro-Wilk,
# Kolmogorov-Smirnov, Anderson-Darling, Cramer-von Mises and adjusted
# Jarque-Bera tests to each sample, and records whether normality is rejected.
# The data are never normal, so the rejection rate is the power (1 - beta).
#
# Args:
#   n:    sample size of each simulated data set.
#   m:    number of simulated data sets.
#   alfa: significance level used for the reject / do-not-reject decision.
#
# Returns:
#   A one-row data.frame with columns SW, KS, AD, CvM, AJB holding the
#   estimated power of each test.
#
# Side effects:
#   Overwrites "testnormal_pvalue.xlsx" and "testnormal_decision.xlsx" in the
#   working directory with the per-sample p-values and 0/1 decisions.
testnormal <- function(n, m, alfa) {
  # library() stops with an error if a package is missing; require() would
  # only return FALSE and fail later with a confusing message.
  library(nortest)
  library(normtest)
  library(xlsx)

  test_names <- c("SW", "KS", "AD", "CvM", "AJB")
  pvalue <- matrix(0, m, length(test_names))

  for (i in seq_len(m)) {
    data <- runif(n, 2, 5)
    pvalue[i, ] <- c(
      shapiro.test(data)$p.value,
      ks.test(data, "pnorm", mean = mean(data), sd = sd(data))$p.value,
      ad.test(data)$p.value,
      cvm.test(data)$p.value,
      ajb.norm.test(data)$p.value
    )
  }

  # 1 = normality rejected. Every column is compared with its own p-value
  # (the AJB column previously reused the KS p-value).
  decision <- (pvalue < alfa) * 1

  result1 <- data.frame(pvalue)
  result2 <- data.frame(decision)
  colnames(result1) <- test_names
  colnames(result2) <- test_names
  write.xlsx(result1, "testnormal_pvalue.xlsx")
  write.xlsx(result2, "testnormal_decision.xlsx")

  # Power = share of samples in which the (false) null hypothesis is rejected.
  test.of.power <- data.frame(t(colMeans(decision)))
  colnames(test.of.power) <- test_names
  test.of.power
}
# Estimate the power at each sample size (100 simulated data sets each,
# significance level 0.05) and stack the one-row results.
sample_sizes <- c(10, 20, 30, 40, 50)
output <- do.call(
  rbind,
  lapply(sample_sizes, function(n) testnormal(n, 100, 0.05))
)
# Label each row with its sample size so the table can be read and plotted.
rownames(output) <- paste0("n=", sample_sizes)
output
I want to graph the power of each test to see how it rises and falls with the sample size. Can anyone help, please?
I went through your code and rewrote along the way to better understand what you want (what is the excel stuff for?). I have broken it down to smaller functions to allow you to have more control in these kinds of simulation studies. The code is not particularly efficient.
But does this give you what you want?
library("nortest")
library("normtest")
library("dplyr")
library("ggplot2")
# Apply five normality tests to one sample.
#
# Args:
#   data: numeric vector holding the sample.
#
# Returns:
#   A tibble with one row per test and the columns `method` and `p.value`.
tests <- function(data) {
  sample_mean <- mean(data)
  sample_sd <- sd(data)
  fitted_tests <- list(
    SW = shapiro.test(data),
    KS = ks.test(data, pnorm, mean = sample_mean, sd = sample_sd),
    AD = ad.test(data),
    CvM = cvm.test(data),
    AJB = ajb.norm.test(data)
  )
  # Strip the "htest" class so each result is a plain list, then stack the
  # lists into a tibble and keep only the two columns that are needed.
  stacked <- bind_rows(lapply(fitted_tests, unclass))
  dplyr::select(stacked, method, p.value)
}
# Test it with e.g. 'tests(data = runif(8, 2, 5))'
# Simulate `m` Uniform(2, 5) samples of size `n`, run every normality test on
# each one, and report per test the share of p-values below `alpha`.
#
# Returns a tibble with one row per test method and the columns `method`,
# `power`, `n`, `m` and `alpha`.
testnormal <- function(n, m, alpha) {
  # A new sample has to be drawn inside every repetition.
  per_sample <- lapply(
    seq_len(m),
    function(i) tests(data = runif(n, 2, 5))
  )
  bind_rows(per_sample) %>%
    group_by(method) %>%
    summarize(power = mean(p.value < alpha)) %>%
    mutate(n = n, m = m, alpha = alpha)
}
# Grid of simulation settings: one row per combination to evaluate.
sims <- expand.grid(
  n = c(10, 20, 30, 40, 50),
  m = 1000,
  alpha = 0.05
)
# Call testnormal() once per grid row and stack the resulting tibbles.
per_setting <- Map(testnormal, n = sims$n, m = sims$m, alpha = sims$alpha)
output <- bind_rows(per_setting)
Actually doing the plot:
# Power against sample size, one line per test method.
ggplot(data = output, mapping = aes(x = n, y = power, colour = method)) +
  geom_line()
This way should make it easier to plot as well as making simulations over other grids of values (e.g. varying alpha) or expand your range of n, etc.
I am using the following code to generate data, and i am estimating regression models across a list of variables (covar1 and covar2). I have also created confidence intervals for the coefficients and merged them together.
I have been examining all sorts of examples here and on other sites, but i can't seem to accomplish what i want. I want to stack the results for each covar into a single data frame, labeling each cluster of results by the covar it is attributable to (i.e., "covar1" and "covar2"). Here is the code for generating data and results using lapply:
## Create a fake dataset (N = 1000: 500 control rows, then 500 treated rows)
# Seed before the first random draw so the whole dataset is reproducible.
set.seed(123)

# Outcome variable
outcome <- c(rnorm(500, mean = 50, sd = 10), rnorm(500, mean = 70, sd = 10))

# Running variable; made negative for the control group (first 500 rows)
running.var <- sample(seq(0, 1, by = .0001), size = 1000, replace = TRUE)
running.var[1:500] <- -running.var[1:500]

# Treatment indicator (binary: 0 = control, 1 = treated)
treat.ind <- c(rep(0, 500), rep(1, 500))

# Covariates
covar1 <- c(rnorm(500, mean = 50, sd = 10), rnorm(500, mean = 50, sd = 20))
covar2 <- c(rnorm(500, mean = 10, sd = 20), rnorm(500, mean = 10, sd = 30))

data <- data.frame(outcome, running.var, treat.ind, covar1, covar2)
data$treat.ind <- as.factor(data$treat.ind)

# Bundle the covariate names together
covars <- c("covar1", "covar2")

# Fit one regression per covariate; each list element is the coefficient
# table with the 95% confidence limits appended as two extra columns.
models <- lapply(covars, function(x) {
  # `data` is the data frame built above (the original referred to an
  # undefined object `d`).
  regres <- lm(as.formula(paste0(x, " ~ running.var + treat.ind")), data = data)
  ci <- confint(regres, level = 0.95)
  cbind(summary(regres)$coefficients, ci)
})
names(models) <- covars
print(models)
Any nudge in the right direction, or link to a post i just haven't come across, is greatly appreciated.
You can use do.call, where the second argument is a list (like here):
do.call(rbind, models)
I made a (possible) improvement to your lapply function. This way you can save the estimated parameters and the variable names in a data.frame:
# For every covariate, fit the regression and return a data.frame holding
# the covariate name, the parameter name, the coefficient table and the 95%
# confidence limits.
models <- lapply(covars, function(x) {
  regres <- lm(as.formula(paste0(x, " ~ running.var + treat.ind")), data = data)
  estimates <- coef(summary(regres))
  limits <- confint(regres, level = 0.95)
  data.frame(covar = x, param = rownames(estimates), estimates, limits)
})
# Stack the per-covariate tables into a single data.frame.
do.call(rbind, models)
A very small version of my problem goes like this:
I have a number of time series
library(data.table)
library(forecast)
library(tidyverse)
# Simulate two ARIMA(1,1,0) series that differ only in their AR coefficient.
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
# Reshape to long format: `var` holds the series name, `value` the observation.
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
And I have modeled these with the fantastic forecast package, using auto.arima and data.table (in reality I have 400+ ts)
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
Which works wonders, my question is how do I update the Arima models for new data?
I have been trying to do something along the lines of
models <-setDT(data)[,list(model=list(Arima(value, model = models$model))), by = var]
But am having no luck!
I have a solution - but would love to know if there is a more R/data.table way to do this?
Note: As I was working to a solution, I changed the data to simulated ARIMA processes - to make sure the models were being updated correctly.
Solution:
# Simulate two ARIMA(1,1,0) series that differ only in their AR coefficient.
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
# Reshape to long format: `var` holds the series name, `value` the observation.
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
# Fit one auto.arima() model per series and keep it in the list column `model`.
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
# Simulate longer series (n = 200) to stand in for the updated data.
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 200)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 200)
data_updated <- data.frame(x,y) %>% gather(var,value) # place updated data into data.frame
# One row per series; the list column `dat` holds that series' values.
data_updated <- setDT(data_updated)[, list(dat=list(value)), by = var] # turn this into lists
#Use a loop to update the models
for(i in unique(models$var)){
# [[1,2]] picks row 1, column 2 of the one-row subset: the stored model on the
# right-hand side, the new values from data_updated as the first argument.
# Arima() receives the stored fit through `model =`; the result is written
# back into the same cell of `models`.
models[var == paste0(i)][[1,2]] <- Arima(data_updated[var == paste0(i)][[1,2]] ,model = models[var == paste0(i)][[1,2]])
}
I have a function that currently plays nice with rgenoud. It has one parameter (xx) and rgenoud will optimize xx perfectly.
However, I would like to add a second parameter to my function that wouldn't be optimized by rgenoud. For example, I would like my function to fit a model with either a Gaussian link or a Poisson link, and to specify that when I call rgenoud.
Any idea?
thanks
edit: here is a minimal working example of what I mean. How would you get the last line to work?
Adstock reflects the fact that TV advertising should have an impact on the number of quotes in future weeks.
Adstock[t] = Ads[t] + rate* Ads[t-1] + rate^2*Ads[t-2] + .... + rate^max_memory * Ads[t-max_memory]
We want rgenoud to figure out what rate and max_memory will return the model with the best fit. Best fit is defined as the lowest RMSE.
set.seed(107)
library(fpp)
library(rgenoud)
# Geometric adstock transformation of an advertising series.
#
# Args:
#   x:            numeric vector of advertising values, in time order.
#   adstock_rate: carry-over rate; lag k is weighted by adstock_rate^k.
#   max_memory:   largest lag that still contributes.
#
# Returns:
#   A plain numeric vector: the filtered series with missing values removed.
adstock_k <- function(x, adstock_rate = 0, max_memory = 12) {
  decay_weights <- rep(adstock_rate, max_memory + 1)^(0:max_memory)
  # Pad with leading zeros so the first observations have a full window.
  padded <- c(rep(0, max_memory), x)
  smoothed <- stats::filter(padded, decay_weights, method = "convolution")
  # The filter leaves NA where the window is incomplete; drop those entries.
  as.numeric(smoothed[!is.na(smoothed)])
}
# Root mean squared error between two numeric vectors.
#
# Args:
#   x, y: numeric vectors (e.g. observed and predicted values).
#
# Returns:
#   A single number: sqrt(mean((x - y)^2)).
getRMSE <- function(x, y) {
  # Plain function calls: no need for a pipe operator from a package that
  # this script does not load.
  sqrt(mean((x - y)^2))
}
# Working data: the `insurance` series as a data.frame, with Quotes scaled
# by 1000 and rounded, plus a numeric row index `idu`.
df <- data.frame(insurance)
df$Quotes <- round(df$Quotes * 1000, digits = 0)
df$idu <- as.numeric(rownames(df))
# Objective function for genoud: RMSE of a linear model of Quotes on the
# adstocked TV advertising.
#
# Args:
#   xx: length-2 vector; xx[1] is the adstock rate in percent and xx[2] is
#       the adstock memory passed to adstock_k().
#
# Returns:
#   The in-sample RMSE of the fitted model.
my_f <- function(xx) {
  scored <- df
  scored$adstock <- adstock_k(scored$TV.advert, xx[1] / 100, xx[2])
  fit <- lm(Quotes ~ adstock, data = scored)
  getRMSE(scored$Quotes, predict(fit))
}
# Search bounds, one row per parameter (lower, upper): the adstock rate in
# percent, then the adstock memory.
domaine <- cbind(c(30, 1), c(85, 8))
# this works
min_f <- genoud(
  my_f,
  nvars = 2,
  max = FALSE,
  pop.size = 1000,
  wait.generations = 10,
  Domains = domaine,
  data.type.int = TRUE
)
# Here a second parameter is added to the objective function.
#
# Args:
#   xx:            length-2 vector; xx[1] is the adstock rate in percent and
#                  xx[2] is the adstock memory passed to adstock_k().
#   first_n_weeks: only rows of `df` with idu <= first_n_weeks are used.
#
# Returns:
#   The in-sample RMSE of a linear model of Quotes on the adstocked TV
#   advertising, fitted on the selected rows.
my_f2 <- function(xx, first_n_weeks = 20) {
  early <- df[df$idu <= first_n_weeks, ]
  early$adstock <- adstock_k(early$TV.advert, xx[1] / 100, xx[2])
  fit <- lm(Quotes ~ adstock, data = early)
  getRMSE(early$Quotes, predict(fit))
}
#this doesnt work
# NOTE(review): `my_f2(first_n_week=10)` calls my_f2 right here, without any
# `xx`, instead of handing the function itself to genoud; the argument is also
# spelled `first_n_week` while the parameter is `first_n_weeks`.
min_f2 <- genoud(my_f2(first_n_week=10), nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
Include the argument in the call to genoud, e.g.
# Pass the function itself; the extra named argument at the end is forwarded
# by genoud to my_f2 and is not part of the optimisation.
genoud(
  my_f2,
  nvars = 2,
  max = FALSE,
  pop.size = 1000,
  wait.generations = 10,
  Domains = domaine,
  data.type.int = TRUE,
  first_n_weeks = 10
)