Plotting power vs. effect size using R pwr package - r

I can successfully create plots of power vs. sample size in R using the pwr package. Example code below.
# Power calculation for a two-sample t-test with fixed group sizes and a fixed
# effect size, followed by the default plot of the result.
library(pwr)
library(tidyverse)
# n1, n2 and d are given and no power is passed, so power is the computed quantity.
plot.out <- pwr.t2n.test(n1=30, n2=30, d=0.5, alternative="two.sided")
# See output in link below
plot(plot.out)
plot() output
I would like to create a similar plot -- a two-sample t-test in which effect size is on the y-axis and power is on the x-axis, with fixed sample sizes.
Is there a way to do this using pwr and/or the plot function? Or would I have to unlist the plot.out object and use it somehow?
I'm still new to power curves in R. Thanks in advance for any advice.

In the code below the power is computed in a loop over the effect sizes in d_seq. Then the power is extracted from the results list, a data.frame is created and plotted.
library(pwr)
library(ggplot2)

# Effect sizes at which the power of the test is evaluated.
d_seq <- seq(0, 2, by = 0.1)

# One power calculation per effect size; both group sizes are fixed at 30.
# `power = NULL` makes power the quantity that pwr.t2n.test() solves for.
pwr_list <- lapply(d_seq, function(d) {
  pwr.t2n.test(
    n1 = 30, n2 = 30,
    d = d,
    power = NULL,
    sig.level = 0.05,
    alternative = "two.sided"
  )
})

# Extract the `power` element of every result. vapply() guarantees a numeric
# vector with one value per effect size, whereas sapply() may silently change
# its return type.
pwr <- vapply(pwr_list, `[[`, numeric(1), "power")
dfpwr <- data.frame(power = pwr, effect.size = d_seq)

# Power curve: effect size on the x-axis, power (as a percentage) on the y-axis.
ggplot(dfpwr, aes(effect.size, power)) +
  geom_point(size = 2, colour = "black") +
  geom_line(size = 0.5, colour = "red") +
  scale_y_continuous(labels = scales::percent) +
  xlab("effect size") +
  ylab(expression("test power =" ~ 1 - beta))
To draw a line where power is 80% and get the effect size, first compute the effect size from the pwr vector by linear interpolation.
# Interpolate with the axes swapped: power is the input (x) and effect size the
# output (y). The result is a list in which pwr80$x is the requested power (0.8)
# and pwr80$y is the interpolated effect size.
pwr80 <- approx(x = pwr, y = d_seq, xout = 0.8)
Now create a label for geom_text and plot it.
# Two-line label. paste0() avoids the stray space that paste() would otherwise
# insert at the start of the second line.
lbl80 <- paste0("Power = 80%\n", "Effect size = ", round(pwr80$y, 2))

ggplot(dfpwr, aes(effect.size, power)) +
  geom_point(size = 2, colour = "black") +
  geom_line(size = 0.5, colour = "red") +
  geom_hline(yintercept = 0.8, linetype = "dotted") +
  # annotate("text", ...) places the label exactly once. A geom_text() layer
  # with constant x/y would draw one copy of the label per row of dfpwr, all
  # stacked on top of each other.
  annotate(
    "text",
    x = pwr80$y, y = pwr80$x,
    label = lbl80,
    hjust = 1, vjust = -1
  ) +
  scale_y_continuous(labels = scales::percent) +
  xlab("effect size") +
  ylab(expression("test power =" ~ 1 - beta))
To also draw a vertical line, add
geom_vline(xintercept = pwr80$y, linetype = "dotted")

Related

Why is fullrange=TRUE not working for geom_smooth in ggplot2?

I have a plot where I am plotting both the linear regressions for each level of a variable as well as the linear regression for the total sample.
library(ggplot2)
library(curl)

# Read the example data and treat `group` as a categorical variable.
df <- read.csv(curl("https://raw.githubusercontent.com/megaraptor1/mydata/main/example.csv"))
df$group <- as.factor(df$group)

ggplot(df, aes(x, y)) +
  geom_point(size = 2.5, shape = 21, aes(fill = group), col = "black") +
  # One regression line per group.
  geom_smooth(
    formula = y ~ x, aes(col = group, group = group),
    method = "lm", size = 1, se = FALSE
  ) +
  # A single regression line fitted to all observations.
  geom_smooth(
    formula = y ~ x, method = "lm", col = "black",
    size = 1, fullrange = TRUE, se = FALSE
  ) +
  theme_classic() +
  theme(legend.position = "none")
I am trying to extend the black line (which represents all specimens) to span the full range of the axes using the command fullrange=T. However, I have found the command fullrange=T is not working on this graph regardless of what I try. This is especially strange as I have not called any limits for the graph or set any additional global factors.
This question was the closest I was able to find to my current problem, but it does not appear to be describing the same issue because that issue had to do with how the limits of the graph were called.
This seems a bit heavy-handed, but it allows you to extend your regression line to whatever limits you choose for the x axis.
The argument fullrange is not really documented very helpfully. If you have a look at http://www.mosaic-web.org/ggformula/reference/gf_smooth.html it appears that "fullrange" applies to the points in the dataframe that is used to generate the regression line. So in your case your regression line is extending to the "fullrange". It's just that your definition of "fullrange" is not quite the same as that used by geom_smooth.
library(ggplot2)
library(dplyr)
library(curl)

# `df` is the example data set; it has to be loaded (see the `data` snippet)
# before this code is run.

# Overall linear fit of y on x, and a helper that evaluates the fitted line.
lm_formula <- lm(formula = y ~ x, data = df)
f_lm <- function(x) {
  lm_formula$coefficients[1] + lm_formula$coefficients[2] * x
}

# End points of the overall regression line at the chosen x limits (0 and 5).
df_lim <-
  data.frame(x = c(0, 5)) %>%
  mutate(y = f_lm(x))

ggplot(df, aes(x, y)) +
  geom_point(size = 2.5, shape = 21, aes(fill = group), col = "black") +
  geom_smooth(
    formula = y ~ x, aes(col = group, group = group),
    method = "lm", size = 1, se = FALSE
  ) +
  # Overall regression line, drawn between the two precomputed end points.
  geom_line(data = df_lim) +
  # `expand` of coord_cartesian() is a logical switch, not an expansion()
  # vector; FALSE removes the padding so the line reaches the panel edges.
  coord_cartesian(xlim = df_lim$x, ylim = df_lim$y, expand = FALSE) +
  theme_classic() +
  theme(legend.position = "none")
data
# Download the example data, then turn the grouping column into a factor.
df <- read.csv(
  curl("https://raw.githubusercontent.com/megaraptor1/mydata/main/example.csv")
)
df[["group"]] <- as.factor(df[["group"]])
Created on 2021-04-05 by the reprex package (v1.0.0)
I had the same issue. Despite setting fullrange = TRUE, the line of best fit was only being drawn in the data range.
# Scatter plot of height against diameter with a linear fit and no confidence
# band; fullrange = TRUE is requested for the fitted line.
ggplot(data = df, mapping = aes(x = diameter, y = height)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  labs(
    x = "Diameter",
    y = "Height",
    title = "Tree Height vs. Diameter"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15, face = "bold")
  )
Bad plot: 1
Using scale_x_continuous() and scale_y_continuous() worked for me (thank you @markus). I added two lines of code, below geom_smooth(), to fix the issue.
ggplot(data = df, mapping = aes(x = diameter, y = height)) +
  geom_point(size = 2) +
  geom_smooth(method = lm, se = FALSE, fullrange = TRUE) +
  # With expand = c(num1, num2) the line of best fit stops being drawn at
  #   upper limit + (upper limit - lower limit) * num1 + num2
  # x axis: 32 + (32 - 5) * 0 + 0 = 32
  scale_x_continuous(expand = c(0, 0), limits = c(5, 32)) +
  # y axis: 25 + (25 - 7) * 0 + 0 = 25
  scale_y_continuous(expand = c(0, 0), limits = c(7, 25)) +
  labs(
    x = "Diameter",
    y = "Height",
    title = "Tree Height vs. Diameter"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 15, face = "bold")
  )
Good plot: 2
Source: How does ggplot scale_continuous expand argument work?

Error when using multiple datasets to plot polygon annotation on ggplot2

I am creating a forest plot for a meta-analysis using ggplot2. I want to manually add a skewed diamond shape (asymmetric on the y-scale) to represent an effect size and confidence interval.
I can draw the forest plot and add four segments to create the diamond but this doesn't give a nice clear, sharp diamond. Instead I've used geom_polygon with a set of co-ordinates in a second dataframe. When I try to write to pdf I receive the following error
# Corner coordinates of the summary diamond; the first point is repeated at the
# end so the outline is closed.
summarydiamond <- data.frame(
x = c(sleepstress.r.CI.L, sleepstress.r.estimate, sleepstress.r.CI.U, sleepstress.r.estimate, sleepstress.r.CI.L),
y = c(-1, -1.5, -1, -0.5, -1)
)
# Forest plot: point estimate r with error bars from r.CI.lower to r.CI.upper
# for each key.pairing, drawn with flipped coordinates.
forest.plot <-
dat.sleepstress %>%
# The x/y/ymin/ymax mappings set here are inherited by every layer below
# unless a layer overrides them.
ggplot(aes(x = rev(key.pairing), y = r, ymin = r.CI.lower, ymax = r.CI.upper))+
geom_errorbar(width = 0.5) +
geom_point(aes(size = r.weights)) +
scale_size(range = c(1, 7)) +
geom_hline(yintercept = 0) +
theme_minimal() +
coord_flip() +
theme(legend.position = "none") +
labs(x = "", y = "Correlation coefficient") +
theme(text = element_text(size=14)) +
scale_x_discrete(limits=rev) +
# Text column "r (lower, upper)" placed at a fixed position y = 0.85.
geom_text(aes(label = paste0(format(round(r, 2),nsmall = 2),
" (",
format(round(r.CI.lower, 2),nsmall = 2),
", ",
format(round(r.CI.upper, 2),nsmall = 2),
")"),
y = 0.85),
hjust="inward") +
# NOTE(review): this layer uses `summarydiamond`, which only has columns x and
# y, yet it still inherits ymin/ymax (r.CI.lower, r.CI.upper) from ggplot()
# above. That is presumably what triggers "object 'r.CI.lower' not found";
# passing inherit.aes = FALSE to this layer should avoid it -- TODO confirm.
# NOTE(review): the diamond's x holds the correlation values while the plot
# maps r to y; the x/y roles here may need swapping -- verify.
geom_polygon(aes(x=x, y=y), data = summarydiamond)
# Write the plot to a 10 x 10 PDF file.
pdf(file = 'forestplot.pdf', width = 10, height = 10)
forest.plot
dev.off()
Output:
forest.plot
Error in FUN(X[[i]], ...) : object 'r.CI.lower' not found
I've tried adding the data= argument to all of the geom_ calls but this doesn't fix it.

How can I make a density scatterplot with log scale in R?

I'd like to make a density scatterplot with log10 scale in R. I tried to plot it using ggplot and stat_density2d in R. I used this code:
# 2D kernel density of agb against vod, shown as a raster of tiles whose fill
# encodes the estimated density.
ggplot(data = vod_agb_df, mapping = aes(x = vod, y = agb)) +
  stat_density2d(
    aes(fill = ..density..),
    geom = "tile",
    contour = FALSE,
    n = 100
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  scale_x_continuous(breaks = seq(0, 1, 0.25), limits = c(0, 1)) +
  scale_y_continuous(breaks = seq(0, 300, 50), limits = c(0, 300)) +
  labs(x = "L-VOD", y = "AGB(Mg/ha)") +
  theme_bw()
But the result looks strange. the density scatterplot with my code
This is the plot I want to plot
The original scatterplot
You can log10-transform the density; here's a minimal & reproducible example
library(MASS)
library(tidyverse)

set.seed(2020)
# 100 draws from a bivariate normal (unit variances, covariance 0.5), plotted
# as a tile density whose fill is the log10 of the estimated density.
mvrnorm(
  n = 100,
  mu = c(0, 0),
  Sigma = matrix(c(1, 0.5, 0.5, 1), nrow = 2, ncol = 2)
) %>%
  as_tibble() %>%
  ggplot(aes(x = V1, y = V2)) +
  stat_density2d(
    aes(fill = log10(..density..)),
    geom = "tile",
    contour = FALSE,
    n = 100
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  theme_bw()
Update
It's not clear to me what you mean by "I'd like to make the density scatterplot in the point distributed area, not the whole area of the plot." If you're asking how to increase the height of the gradient colour bar, you can do the following:
set.seed(2020)
# Same simulated data and log10 density tiles as before, with a taller colour
# bar whose title sits on the right.
mvrnorm(
  n = 100,
  mu = c(0, 0),
  Sigma = matrix(c(1, 0.5, 0.5, 1), nrow = 2, ncol = 2)
) %>%
  as_tibble() %>%
  ggplot(aes(x = V1, y = V2)) +
  stat_density2d(
    aes(fill = log10(..density..)),
    geom = "tile",
    contour = FALSE,
    n = 100
  ) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  theme_bw() +
  guides(
    fill = guide_colorbar(
      barheight = unit(3.5, "in"),
      title.position = "right"
    )
  )
To produce the kind of plot you show as your expected output, you can use the following code:
library(tidyverse)

# Bin size control + color palette
# Counts of observations in a 20-bin 2D grid, without grid lines.
ggplot(iris, aes(x = Sepal.Length, y = Petal.Length)) +
  geom_bin2d(bins = 20) +
  scale_fill_distiller(palette = "YlOrRd", direction = 1) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

How to plot vectors from capscale analysis in ggplot2?

I have generated a capscale model of presence-absence community composition data using capscale in vegan. I am able to use the following code to produce a plot using base R
# Specify the capscale model, then print it.
SOR.capscale <- capscale(
  SOR ~ df.meta.clin.week.snake$copy.num + df.meta.clin.week.snake$exp.time,
  df.otus.clin.week.snake.nonzero
)
SOR.capscale

# Base-graphics plot of the model: points coloured by expgroup, plus a hull
# around the sites of each group.
plot(SOR.capscale, main = "Capscale Analysis", xlab = "CAP1", ylab = "CAP2")
points(SOR.capscale, col = expgroup)
ordihull(
  SOR.capscale, expgroup,
  col = c("black", "red"),
  label = FALSE,
  display = "sites"
)
This generates the following visualization
I have used the following code to extract the CAP values from the capscale model to plot them in ggplot
# Site scores of the capscale model as a data frame; copy its CAP1 and CAP2
# columns into the data frame used for plotting.
x <- as.data.frame(scores(SOR.capscale, display = "sites"))
df.pred.clin.week.snake[["CAP1"]] <- x[["CAP1"]]
df.pred.clin.week.snake[["CAP2"]] <- x[["CAP2"]]
I have used this data to generate a plot in ggplot with the listed code
# Ordination plot of the extracted CAP1/CAP2 scores: points and a filled
# ellipse per experimental group, with dotted reference lines through the origin.
ggplot(df.pred.clin.week.snake, aes(x = CAP1, y = CAP2, color = expgroup)) +
  stat_ellipse(aes(fill = expgroup), geom = "polygon", alpha = 0.2) +
  geom_point() +
  theme_classic() +
  coord_cartesian(xlim = c(-20, 20), ylim = c(-6.5, 15)) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_vline(xintercept = 0, linetype = "dotted") +
  labs(color = "Experimental Treatment", fill = "Experimental Treatment") +
  ggtitle("Capscale Analysis") +
  # The trailing `+` matters: without it the axis labels below are a separate
  # expression and never become part of the plot.
  theme(plot.title = element_text(hjust = 0.5)) +
  ylab("CAP2") +
  xlab("CAP1")
My question is: how would I extract the vector information from the capscale analysis model in such a way that I am then able to plot the vectors using ggplot? Thank you greatly!
In capscale, the information that tells you where the arrows go is stored in the model object, generally in SOR.capscale$CCA$biplot. Whatever those numbers are, you can plug them in:
# `plot` presumably stands for the ggplot object built earlier -- TODO confirm.
# Each segment is an arrow from the origin to one set of example coordinates.
plot +
  geom_segment(
    aes(x = 0.0, y = 0.0, xend = 10.9, yend = -0.01),
    arrow = arrow()
  ) +
  geom_segment(
    aes(x = 0.0, y = 0.0, xend = 0.01, yend = 10.1),
    arrow = arrow()
  )
or whatever your actual numbers are.

Correlation loading plot from PLSR with observations using ggplot2

I am investigating the correlation between sensory data and chemical measurements using PLS regression from the pls package. Ultimately, I want to display the results in a correlation loading plot as illustrated by the example below. So far I managed to make the plot with X and Y correlation matrices but I haven't figured out a way to project the observations on the plot.
As an example, I use the oliveoil data set from the pls package. I computed the correlation loadings (using the method described here) and created a correlation plot using ggplot2 (This can be done in a simple manner using the plsdepot package but I like the versatility of ggplot):
# Fit the PLS regression of the sensory data on the chemical measurements,
# using the oliveoil data set.
library(pls)
data("oliveoil")
oil <- plsr(sensory ~ chemical, scale = TRUE, data = oliveoil)
# Keep the first two columns of the score matrix as sc1 and sc2.
scores <- oil$scores
sc1 <- scores[,1]
sc2 <- scores[,2]
scores <- as.data.frame(cbind(sc1, sc2))
# Correlation loadings: correlate the model matrix (presumably the chemical
# predictors -- TODO confirm) and the sensory variables with the two scores.
cl_plsr <- cor(model.matrix(oil), scores)
df_cor <- as.data.frame(cl_plsr)
df_depend_cor <- as.data.frame(cor(oliveoil$sensory, scores))
# Stack both sets of correlations and name the columns comp1/comp2 for plotting.
plot_loading_correlation <- rbind(df_cor, df_depend_cor)
plot_loading_correlation1 <- setNames(plot_loading_correlation, c("comp1", "comp2"))
# Function to draw circle: returns the outline of a circle as (x, y) points.
#
# center    numeric vector of length 2 giving the x and y coordinates of the
#           centre.
# diameter  diameter of the circle.
# npoints   number of points along the outline; the angle runs from 0 to 2*pi,
#           so the first and last points mark the same position.
#
# Returns a data.frame with columns `x` and `y` and `npoints` rows.
circleFun <- function(center = c(0, 0), diameter = 1, npoints = 100) {
  # A centre with fewer than two coordinates would silently yield NA values.
  stopifnot(is.numeric(center), length(center) == 2L)
  r <- diameter / 2
  tt <- seq(0, 2 * pi, length.out = npoints)
  data.frame(
    x = center[1] + r * cos(tt),
    y = center[2] + r * sin(tt)
  )
}

# Circle of diameter 2 (radius 1) centred on the origin.
dat_plsr <- circleFun(c(0, 0), 2, npoints = 100)
library(ggplot2)
library(ggrepel)

# Correlation loading plot. The first 5 rows of plot_loading_correlation1 and
# the remaining 6 rows are styled as two groups (presumably the chemical and
# the sensory variables -- TODO confirm).
p <- ggplot(data = plot_loading_correlation1, aes(comp1, comp2)) +
  theme_bw() +
  # Dotted reference lines through the origin.
  geom_hline(aes(yintercept = 0), size = .2, linetype = 3) +
  geom_vline(aes(xintercept = 0), size = .2, linetype = 3) +
  # Variable names as non-overlapping labels.
  geom_text_repel(
    aes(
      label = rownames(plot_loading_correlation1),
      colour = c(rep("black", 5), rep("red", 6))
    )
  ) +
  scale_color_manual(values = c("blue", "red")) +
  scale_x_continuous(breaks = seq(-1, 2.5, by = 0.5)) +
  scale_y_continuous(breaks = seq(-1.5, 2.5, by = 0.5)) +
  coord_fixed(ylim = c(-1, 1), xlim = c(-1, 1)) +
  xlab("PC 1") +
  ylab("PC 2") +
  # Circle outline taken from dat_plsr.
  geom_path(data = dat_plsr, aes(x, y), colour = "darkgrey") +
  theme(
    legend.title = element_blank(),
    axis.ticks = element_line(colour = "black"),
    axis.title = element_text(colour = "black"),
    axis.text = element_text(color = "black"),
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank()
  ) +
  # One marker per variable; colour, fill and shape differ between the groups.
  geom_point(
    data = plot_loading_correlation1,
    aes(x = comp1, y = comp2),
    colour = c(rep("blue", 5), rep("red", 6)),
    shape = rep(c(21, 22), times = c(5, 6)),
    fill = c(rep("blue", 5), rep("red", 6)),
    size = 2.2
  )
p
How can I project individual observations to that plot as illustrated in the example above? Should the scores be scaled so that they fit on the correlation loadings scale (from -1 to 1)? And is that acceptable scientifically?

Resources