put correlation coefficient on ggplot scatter plot after faceting - r

I'm having issue to put correlation coefficient on my scatter plot after facet_wrap by another variable.
Below is the example I made using mtcars dataset for illustration purpose.
when I plot it out, both plot have the same correlation number. It seems the correlation coef is not calculated for each facet. I could not figure out a way to achieve that. Really appreciate it if anyone could kindly help with that...
library(ggplot2)
library(dplyr)
corr_eqn <- function(x,y, method='pearson', digits = 2) {
corr_coef <- round(cor.test(x, y, method=method)$estimate, digits = digits)
corr_pval <- tryCatch(format(cor.test(x,y, method=method)$p.value,
scientific=TRUE),
error=function(e) NA)
paste(method, 'r = ', corr_coef, ',', 'pval =', corr_pval)
}
sca.plot <- function (cor.coef=TRUE) {
df<- mtcars %>% filter(vs==1)
p<- df %>%
ggplot(aes(x=hp, y=mpg))+
geom_point()+
geom_smooth()+
facet_wrap(~cyl, ncol=3)
if (cor.coef) {
p<- p+geom_text(x=0.9*max(df$hp, na.rm=TRUE),
y=0.9*max(df$mpg, na.rm=TRUE),
label = corr_eqn(df[['hp']],df[['mpg']],
method='pearson'))
}
return (p)
}
sca.plot(cor.coef=TRUE)

Call facets through variable inputFacet, loop over this variable to calculate corr_enq and plot facets using variable name with get.
In shiny you'll probably have user input as input$facet here it's called inputFacet. We plot main plot getting this variable in facet_wrap(~ get(inputFacet), ncol = 3). Next we loop over all facet options with for(i in seq_along(resCor$facets)) and store result in rescore.
This should solve "correlation coef is not calculated for each facet" problem.
library(dplyr)
library(ggplot2)
inputFacet <- "cyl"
cor.coef = TRUE
df <- mtcars
p <- df %>%
ggplot(aes(hp, mpg))+
geom_point()+
geom_smooth()+
facet_wrap(~ get(inputFacet), ncol = 3)
if (cor.coef) {
resCor <- data.frame(facets = unique(mtcars[, inputFacet]))
for(i in seq_along(resCor$facets)) {
foo <- mtcars[mtcars[, inputFacet] == resCor$facets[i], ]
resCor$text[i] <- corr_eqn(foo$hp, foo$mpg)
}
colnames(resCor)[1] <- inputFacet
p <- p + geom_text(data = resCor,
aes(0.9 * max(df$hp, na.rm = TRUE),
0.9 * max(df$mpg, na.rm = TRUE),
label = text))
}
p

Related

Changing aesthetics in ggplot generated by svars package in R

I'm using the svars package to generate some IRF plots. The plots are rendered using ggplot2, however I need some help with changing some of the aesthetics.
Is there any way I can change the fill and alpha of the shaded confidence bands, as well as the color of the solid line? I know in ggplot2 you can pass fill and alpha arguments to geom_ribbon (and col to geom_line), just unsure of how to do the same within the plot function of this package's source code.
# Load Dataset and packages
library(tidyverse)
library(svars)
data(USA)
# Create SVAR Model
var.model <- vars::VAR(USA, lag.max = 10, ic = "AIC" )
svar.model <- id.chol(var.model)
# Wild Bootstrap
cores <- parallel::detectCores() - 1
boot.svar <- wild.boot(svar.model, n.ahead = 30, nboot = 500, nc = cores)
# Plot the IRFs
plot(boot.svar)
I'm also looking at the command for a historical decomposition plot (see below). Is there any way I could omit the first two facets and plot only the bottom three lines on the same facet?
hist.decomp <- hd(svar.model, series = 1)
plot(hist.decomp)
Your first desired result is easily achieved by resetting the aes_params after calling plot. For your second goal. There is probably an approach to manipulate the ggplot object. Instead my approach below constructs the plot from scratch. Basically I copy and pasted the data wrangling code from vars:::plot.hd and filtered the prepared dataset for the desired series:
# Plot the IRFs
p <- plot(boot.svar)
p$layers[[1]]$aes_params$fill <- "pink"
p$layers[[1]]$aes_params$alpha <- .5
p$layers[[2]]$aes_params$colour <- "green"
p
# Helper to convert to long dataframe. Source: svars:::plot.hd
hd2PlotData <- function(x) {
PlotData <- as.data.frame(x$hidec)
if (inherits(x$hidec, "ts")) {
tsStructure = attr(x$hidec, which = "tsp")
PlotData$Index <- seq(from = tsStructure[1], to = tsStructure[2],
by = 1/tsStructure[3])
PlotData$Index <- as.Date(yearmon(PlotData$Index))
}
else {
PlotData$Index <- 1:nrow(PlotData)
PlotData$V1 <- NULL
}
dat <- reshape2::melt(PlotData, id = "Index")
dat
}
hist.decomp <- hd(svar.model, series = 1)
dat <- hd2PlotData(hist.decomp)
dat %>%
filter(grepl("^Cum", variable)) %>%
ggplot(aes(x = Index, y = value, color = variable)) +
geom_line() +
xlab("Time") +
theme_bw()
EDIT One approach to change the facet labels is via a custom labeller function. For a different approach which changes the facet labels via the data see here:
myvec <- LETTERS[1:9]
mylabel <- function(labels, multi_line = TRUE) {
data.frame(variable = labels)
}
p + facet_wrap(~variable, labeller = my_labeller(my_labels))

trouble adding SMA fit line to ggplot - geom_segment() nor geom_abline() don't match

I am trying to add the trendline from an SMA (standardized major axis) fit to my ggplot. However, when I extract the coefficients from the SMA and give them to geom_abline() the line extends over the entire plot instead of clipping to the data. The natural solution to this would be use a geom_segment() instead, manually calculating the endpoints of the line. However, when I do this the lines don't match each other and neither match the SMA fit. What's going on here?
I am aware that you can use the plot function directly on an sma object but I would prefer to use ggplot
Note: this is my first time asking a question so my apologies if I'm missing something!
Edit: I am using a log-log axis, which I suspect may be part of the issue.
Reproducible version below:
library(tidyverse)
library(smatr) #for the SMA
# sample data set
x <- rlnorm(100, meanlog = 10)
var <- rlnorm(100, meanlog = 10)
df <- data.frame(x=x, y=x+var)
# fit using an SMA
sm <- sma(x~y, data = df, log = "xy")
# get sma coefficients into a data.frame
bb <- data.frame(coef(sm))
bb <- bb %>%
rownames_to_column(var = "Coef") %>%
pivot_wider(names_from = "Coef", values_from = "coef.sm.")
## calculate end coordinates for segment
bb$min_x <- min(df$x, na.rm = TRUE)
bb$max_x <- max(df$x, na.rm = TRUE)
bb <- bb %>%
mutate(min_y = (slope*min_x) + elevation) %>%
mutate(max_y = (slope*max_x) + elevation)
# plot into ggplot
p1 <- ggplot(df, aes(x=x, y=y)) +
geom_point(shape=21) +
scale_y_continuous(trans = 'log10')+
scale_x_continuous(trans = 'log10') +
geom_abline(data=bb,aes(intercept=elevation,slope=slope), color = "blue")
p1 + geom_segment(data=bb, aes(x=min_x, xend=max_x, y=min_y, yend=max_y), color = "orange")
#this is the plot from the smatr package for comparison
plot(sm)

Separated Boxplots for each column of a dataset

I have a data frame with 79 columns.
For each column, I am trying to produce an entirely separated boxplot.
I have tried
apply(integers, 2,function(x) boxplot(x, main = colnames(integers["x"])))
However, I cannot add the title of each column to the respective boxplot.
library(tidyverse)
plot_function <- function(column_name, data_in) {
plot_out <- ggplot(data_in, aes_string(y = column_name)) +
geom_boxplot() +
labs(title = column_name)
return(plot_out)
}
plot_columns <- names(iris)[1:4]
plot <- lapply(plot_columns, function(x, y) plot_function(x, y), y = iris)
plot[[1]]

How to change type of line in prophet plot?

Facebook's Prophet in R (there's also a Python version) is used to generate time series forecasts.
A model m is created by:
m <- prophet(df)
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
plot(m, forecast)
Which returns a very nicely formatted graph, like:
I would like to change the line type, to get not dots but a usual thin line.
I had tried this
lines(m$history$y,lty=1)
but got an error
In doTryCatch(return(expr), name, parentenv, handler)
Are there are any suggestions how to convert those dots into a line?
The plot method for prophet objects uses ggplot2, so base R graphics functions like lines() won't work. You can use ggplot2::geom_line() to add lines, but at the moment I don't see an easy way to replace the points by lines ...
Example from ?prophet:
history <- data.frame(ds = seq(as.Date('2015-01-01'), as.Date('2016-01-01'), by = 'd'),
y = sin(1:366/200) + rnorm(366)/10)
m <- prophet(history)
future <- make_future_dataframe(m, periods = 365)
forecast <- predict(m, future)
pp <- plot(m,forecast)
Add lines:
library(ggplot2)
pp + geom_line()
This question provides a (hacky) way forward:
pp2 <- pp + geom_line()
qq2 <- ggplot_build(pp2)
qq2$data[[2]]$colour <- NA
plot(ggplot_gtable(qq2))
But obviously something went wrong with the hack. The better bet would be to look at the plot method(prophet:::plot.prophet) and modify it to behave as you want ... Here is the bare-bones version:
df <- prophet:::df_for_plotting(m, forecast)
gg <-ggplot(df, aes(x = ds, y = y)) + labs(x = "ds", y = "y")
gg <- gg + geom_ribbon(ggplot2::aes(ymin = yhat_lower,
ymax = yhat_upper), alpha = 0.2, fill = "#0072B2",
na.rm = TRUE)
## replace first geom_point() with geom_line() in next line ...
gg <- gg + geom_line(na.rm = TRUE) + geom_line(aes(y = yhat),
color = "#0072B2", na.rm = TRUE) + theme(aspect.ratio = 3/5)
I may have stripped out some components that exist in your data/forecast, though ...
it is possible to make such manipulations with dyplot.prophet(m, forecast) (html version of plot) :) before that, we should rewrite function like here:
dyplot.prophet <- function(x, fcst, uncertainty=TRUE,
...)
{
forecast.label='Predicted'
actual.label='Actual'
# create data.frame for plotting
df <- prophet:::df_for_plotting(x, fcst)
# build variables to include, or not, the uncertainty data
if(uncertainty && exists("yhat_lower", where = df))
{
colsToKeep <- c('y', 'yhat', 'yhat_lower', 'yhat_upper')
forecastCols <- c('yhat_lower', 'yhat', 'yhat_upper')
} else
{
colsToKeep <- c('y', 'yhat')
forecastCols <- c('yhat')
}
# convert to xts for easier date handling by dygraph
dfTS <- xts::xts(df %>% dplyr::select_(.dots=colsToKeep), order.by = df$ds)
# base plot
dyBase <- dygraphs::dygraph(dfTS)
presAnnotation <- function(dygraph, x, text) {
dygraph %>%
dygraphs::dyAnnotation(x, text, text, attachAtBottom = TRUE)
}
dyBase <- dyBase %>%
# plot actual values
dygraphs::dySeries(
'y', label=actual.label, color='black',stepPlot = TRUE, strokeWidth=1
) %>%
# plot forecast and ribbon
dygraphs::dySeries(forecastCols, label=forecast.label, color='blue') %>%
# allow zooming
dygraphs::dyRangeSelector() %>%
# make unzoom button
dygraphs::dyUnzoom()
if (!is.null(x$holidays)) {
for (i in 1:nrow(x$holidays)) {
# make a gray line
dyBase <- dyBase %>% dygraphs::dyEvent(
x$holidays$ds[i],color = "rgb(200,200,200)", strokePattern = "solid")
dyBase <- dyBase %>% dygraphs::dyAnnotation(
x$holidays$ds[i], x$holidays$holiday[i], x$holidays$holiday[i],
attachAtBottom = TRUE)
}
}
return(dyBase)
}
the strokeWidth=0 was before and we have changed it to strokeWidth=1 and added stepPlot = TRUE
the whole basis code is situated here: https://rdrr.io/cran/prophet/src/R/plot.R

ggplot2 boxplots - How to avoid extra vertical space when there are no significant comparisons?

After many questions on how to make boxplots with facets and significance levels, particularly this and this, I still have one more little problem.
I managed to produce the plot shown below, which is exactly what I want.
The problem I am facing now is when I have very few, or no significant comparisons; in those cases, the whole space dedicated to the brackets showing the significance levels is still preserved, but I want to get rid of it.
Please check this MWE with the iris dataset:
library(reshape2)
library(ggplot2)
data(iris)
iris$treatment <- rep(c("A","B"), length(iris$Species)/2)
mydf <- melt(iris, measure.vars=names(iris)[1:4])
mydf$treatment <- as.factor(mydf$treatment)
mydf$variable <- factor(mydf$variable, levels=sort(levels(mydf$variable)))
mydf$both <- factor(paste(mydf$treatment, mydf$variable), levels=(unique(paste(mydf$treatment, mydf$variable))))
a <- combn(levels(mydf$both), 2, simplify = FALSE)#this 6 times, for each lipid class
b <- levels(mydf$Species)
CNb <- relist(
paste(unlist(a), rep(b, each=sum(lengths(a)))),
rep.int(a, length(b))
)
CNb
CNb2 <- data.frame(matrix(unlist(CNb), ncol=2, byrow=T))
CNb2
#new p.values
pv.df <- data.frame()
for (gr in unique(mydf$Species)){
for (i in 1:length(a)){
tis <- a[[i]] #variable pair to test
as <- subset(mydf, Species==gr & both %in% tis)
pv <- wilcox.test(value ~ both, data=as)$p.value
ddd <- data.table(as)
asm <- as.data.frame(ddd[, list(value=mean(value)), by=list(both=both)])
asm2 <- dcast(asm, .~both, value.var="value")[,-1]
pf <- data.frame(group1=paste(tis[1], gr), group2=paste(tis[2], gr), mean.group1=asm2[,1], mean.group2=asm2[,2], log.FC.1over2=log2(asm2[,1]/asm2[,2]), p.value=pv)
pv.df <- rbind(pv.df, pf)
}
}
pv.df$p.adjust <- p.adjust(pv.df$p.value, method="BH")
colnames(CNb2) <- colnames(pv.df)[1:2]
# merge with the CN list
pv.final <- merge(CNb2, pv.df, by.x = c("group1", "group2"), by.y = c("group1", "group2"))
# fix ordering
pv.final <- pv.final[match(paste(CNb2$group1, CNb2$group2), paste(pv.final$group1, pv.final$group2)),]
# set signif level
pv.final$map.signif <- ifelse(pv.final$p.adjust > 0.05, "", ifelse(pv.final$p.adjust > 0.01,"*", "**"))
# subset
G <- pv.final$p.adjust <= 0.05
CNb[G]
P <- ggplot(mydf,aes(x=both, y=value)) +
geom_boxplot(aes(fill=Species)) +
facet_grid(~Species, scales="free", space="free_x") +
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_signif(test="wilcox.test", comparisons = combn(levels(mydf$both),2, simplify = F),
map_signif_level = F,
vjust=0.5,
textsize=4,
size=0.5,
step_increase = 0.06)
P2 <- ggplot_build(P)
#pv.final$map.signif <- "" #UNCOMMENT THIS LINE TO MOCK A CASE WHERE THERE ARE NO SIGNIFICANT COMPARISONS
#pv.final$map.signif[c(1:42,44:80,82:84)] <- "" #UNCOMMENT THIS LINE TO MOCK A CASE WHERE THERE ARE JUST A COUPLE OF SIGNIFICANT COMPARISONS
P2$data[[2]]$annotation <- rep(pv.final$map.signif, each=3)
# remove non significants
P2$data[[2]] <- P2$data[[2]][P2$data[[2]]$annotation != "",]
# and the final plot
png(filename="test.png", height=800, width=800)
plot(ggplot_gtable(P2))
dev.off()
Which produces this plot:
The plot above is exactly what I want... But I am facing cases where there are no significant comparisons, or very few. In these cases, a lot of vertical space is left empty.
To exemplify those scenarios, we can uncomment the line:
pv.final$map.signif <- "" #UNCOMMENT THIS LINE TO MOCK A CASE WHERE THERE ARE NO SIGNIFICANT COMPARISONS
So when there are no significant comparisons I get this plot:
If we uncomment this other line instead:
pv.final$map.signif[c(1:42,44:80,82:84)] <- "" #UNCOMMENT THIS LINE TO MOCK A CASE WHERE THERE ARE JUST A COUPLE OF SIGNIFICANT COMPARISONS
We are in a case where there are only a couple of significant comparisons, and obtain this plot:
So my question here is:
How to adjust the vertical space to the number of significant comparisons, so no vertical space is left there?
There might be something I could change in step_increase or in y_position inside geom_signif(), so I only leave space for the significant comparisons in CNb[G]...
One option is to pre-calculate the p-values for each combination of both levels and then select only the significant ones for plotting. Since we then know up front how many are significant, we can adjust the y-ranges of the plots to account for that. However, it doesn't look like geom_signif is capable of doing only within-facet calculations for the p-value annotations (see the help for the manual argument). Thus, instead of using ggplot's faceting, we instead use lapply to create a separate plot for each Species and then use grid.arrange from the gridExtra package to lay out the individual plots as if they were faceted.
(To respond to the comments, I want to emphasize that the plots are all still created with ggplot2, but we create what would have been the three facet panels of a single plot as three separate plots and then lay them out together as if they had been faceted.)
The function below is hard-coded for the data frame and column names in the OP, but can of course be generalized to take any data frame and column names.
library(gridExtra)
library(tidyverse)
# Change data to reduce number of statistically significant differences
set.seed(2)
df = mydf %>% mutate(value=rnorm(nrow(mydf)))
# Function to generate and lay out the plots
signif_plot = function(signif.cutoff=0.05, height.factor=0.23) {
# Get full range of y-values
y_rng = range(df$value)
# Generate a list of three plots, one for each Species (these are the facets)
plot_list = lapply(split(df, df$Species), function(d) {
# Get pairs of x-values for current facet
pairs = combn(sort(as.character(unique(d$both))), 2, simplify=FALSE)
# Run wilcox test on every pair
w.tst = pairs %>%
map_df(function(lv) {
p.value = wilcox.test(d$value[d$both==lv[1]], d$value[d$both==lv[2]])$p.value
data.frame(levs=paste(lv, collapse=" "), p.value)
})
# Record number of significant p.values. We'll use this later to adjust the top of the
# y-range of the plots
num_signif = sum(w.tst$p.value <= signif.cutoff)
# Plot significance levels only for combinations with p <= signif.cutoff
p = ggplot(d, aes(x=both, y=value)) +
geom_boxplot() +
facet_grid(~Species, scales="free", space="free_x") +
geom_signif(test="wilcox.test", comparisons = pairs[which(w.tst$p.value <= signif.cutoff)],
map_signif_level = F,
vjust=0,
textsize=3,
size=0.5,
step_increase = 0.08) +
theme_bw() +
theme(axis.title=element_blank(),
axis.text.x = element_text(angle=45, hjust=1))
# Return the plot and the number of significant p-values
return(list(num_signif, p))
})
# Get the highest number of significant p-values across all three "facets"
max_signif = max(sapply(plot_list, function(x) x[[1]]))
# Lay out the three plots as facets (one for each Species), but adjust so that y-range is same
# for each facet. Top of y-range is adjusted using max_signif.
grid.arrange(grobs=lapply(plot_list, function(x) x[[2]] +
scale_y_continuous(limits=c(y_rng[1], y_rng[2] + height.factor*max_signif))),
ncol=3, left="Value")
}
Now run the function with four different significance cutoffs:
signif_plot(0.05)
signif_plot(0.01)
signif_plot(0.9)
signif_plot(0.0015)
You can try. Although the answer is similar to my answer here, I added now a function.
library(tidyverse)
library(ggsignif)
# 1. your data
set.seed(2)
df <- as.tbl(iris) %>%
mutate(treatment=rep(c("A","B"), length(iris$Species)/2)) %>%
gather(key, value, -Species, -treatment) %>%
mutate(value=rnorm(n())) %>%
mutate(key=factor(key, levels=unique(key))) %>%
mutate(both=interaction(treatment, key, sep = " "))
# 2. pairwise.wilcox.test for 1) validation and 2) to calculate the ylim
Wilcox <- df %>%
split(., .$Species) %>%
map(~tidy(pairwise.wilcox.test(.$value, .$both, p.adjust.method = "none"))) %>%
map(~filter(.,.$p.value < 0.05)) %>%
bind_rows(.id="Species") %>%
mutate(padjust=p.adjust(p.value, method = "BH"))
# 3. calculate y range
Ylim <- df %>%
summarise(Min=round(min(value)),
Max=round(max(value))) %>%
mutate(Max=Max+0.5*group_by(Wilcox, Species) %>% count() %>% with(.,max(n)))
%>% c()
# 4. the plot function
foo <- function(df, Ylim, Signif=0.05){
P <- df %>%
ggplot(aes(x=both, y=value)) +
geom_boxplot(aes(fill=Species)) +
facet_grid(~Species) +
ylim(Ylim$Min, Ylim$Max)+
theme(axis.text.x = element_text(angle=45, hjust=1)) +
geom_signif(comparisons = combn(levels(df$both),2,simplify = F),
map_signif_level = F, test = "wilcox.test" ) +
stat_summary(fun.y=mean, geom="point", shape=5, size=4) +
xlab("")
# 5. remove not significant values and add step increase
P_new <- ggplot_build(P)
P_new$data[[2]] <- P_new$data[[2]] %>%
filter(as.numeric(as.character(annotation)) < 0.05) %>%
group_by(PANEL) %>%
mutate(index=(as.numeric(group[drop=T])-1)*0.5) %>%
mutate(y=y+index,
yend=yend+index) %>%
select(-index) %>%
as.data.frame()
# the final plot
plot(ggplot_gtable(P_new))
}
foo(df, Ylim)
trying other data
set.seed(12345)
df <- as.tbl(iris) %>%
mutate(treatment=rep(c("A","B"), length(iris$Species)/2)) %>%
gather(key, value, -Species, -treatment) %>%
mutate(value=rnorm(n())) %>%
mutate(key=factor(key, levels=unique(key))) %>%
mutate(both=interaction(treatment, key, sep = " "))
foo(df, list(Min=-3,Max=5))
Ofcourse you can add the Ylim calculation to the function as well. In addition you can change or add ggtitel(), ylab() and change the color.

Resources