Passing params to distribution in geom_qq with facets - r

I'd like to create QQ-plots of a t-distribution using ggplot2's geom_qq() function. Hadley provides a nice example of how to do this here, but it's only for a single distribution. I wish to extend this to multiple groups with a facet and distribution for each group. I found a similar and related question here, but it doesn't really answer the question.
Passing either a list or vector of greater than length 1 does not seem to work.
library(ggplot2)
a <- 1:10
df <- data.frame(a = a, b = rt(1000, df = a))
deg_free <-
lapply(a, function(x) {
return(MASS::fitdistr(subset(df, a == x)$b,
"t")$estimate["df"])
})
g <-
ggplot(data=df, aes(sample=b)) +
geom_qq(distribution = qt, dparams = deg_free) +
geom_qq_line(distribution = qt, dparams = deg_free) +
facet_wrap(~a)
Does anyone know how to do this without resorting to computing quantiles for the data and manually plotting the QQ points and lines?

For ggplot to take degrees of freedom into account in facets, the dataframe passed into ggplot() should contain that as a column:
library(dplyr)
set.seed(123) # for reproducibility
a <- 1:10
df <- data.frame(a = a, b = rt(1000, df = a))
deg_free <-
lapply(a, function(x) {
return(MASS::fitdistr(subset(df, a == x)$b,
"t")$estimate["df"])
})
df <- df %>%
left_join(data.frame(d = unlist(deg_free), a = a),
by = "a")
rm(a, deg_free)
> head(df)
a b d
1 1 -0.2624269 1.526920
2 2 -3.4784976 1.447293
3 3 1.6535141 2.819679
4 4 2.3848622 3.240377
5 5 0.4233105 3.946170
6 6 1.4423866 5.893569
With that out of the way, we can try to define modified versions of geom_qq / geom_qq_line that look for degrees of freedom df as a mapped aesthetic. Here's how the result can look like:
ggplot(df,
aes(sample=b, df = d)) +
geom_qq2(distribution = qt) +
geom_qq_line2(distribution = qt) +
facet_wrap(~a, scales = "free")
Code to create geom_qq2 / geom_qq_line2:
library(magrittr)
library(ggplot2)
# take reference from the compute_group functions for StatQq / StatQqLine
# but modify the code to include df in dparams, if it's a mapped aesthetic
compute_group_StatQq2 <- environment(StatQq$compute_group)$f
compute_group_StatQqLine2 <- environment(StatQqLine$compute_group)$f
body(compute_group_StatQq2) <- body(compute_group_StatQq2) %>% as.list() %>%
append(quote(if("df" %in% colnames(data)) dparams <- append(dparams, list("df" = data$df[1]))),
after = 1L) %>%
as.call()
body(compute_group_StatQqLine2) <- body(compute_group_StatQqLine2) %>% as.list() %>%
append(quote(if("df" %in% colnames(data)) dparams <- append(dparams, list("df" = data$df[1]))),
after = 1L) %>%
as.call()
# define modified ggproto classes
# which inherit from StatQq / StatQqLine, but use modified compute_group functions
StatQq2 <- ggproto("StatQq2", StatQq, compute_group = compute_group_StatQq2)
StatQqLine2 <- ggproto("StatQqLine2", StatQqLine, compute_group = compute_group_StatQqLine2)
# define modified geom functions
# which are based on geom_qq / geom_qq_line, but use Stat = modified Stat
geom_qq2 <- geom_qq
geom_qq_line2 <- geom_qq_line
body(geom_qq2) <- body(geom_qq) %>% as.list() %>%
inset2(2, (.) %>% extract2(2) %>% as.list() %>%
modifyList(val = list(stat = quote(StatQq2))) %>%
as.call()) %>%
as.call()
body(geom_qq_line2) <- body(geom_qq_line2) %>% as.list() %>%
inset2(2, (.) %>% extract2(2) %>% as.list() %>%
modifyList(val = list(stat = quote(StatQqLine2))) %>%
as.call()) %>%
as.call()
Code used to modify the body of a function took reference from MrFlick's answer to How to insert expression into the body of a function in R.
Disclaimer: I've never used geom_qq** before today. If I've missed out things while modifying the computation functions in StatQq**, let me know & I'll try to sort them out.

I don't think geom_qq is set up to handle having different parameters per facet, so the way to do this might be to produce a plot separately for each subset of the data and combine them with something like cowplot::plot_grid:
library(tidyverse)
plots = df %>%
group_by(a) %>%
mutate(deg_free = MASS::fitdistr(b, "t")$estimate["df"]) %>%
# This second group_by is just used to keep the deg_free value
# in the final dataframe, could be removed
group_by(a, deg_free) %>%
do(
plot = ggplot(data=., aes(sample=b)) +
geom_qq(distribution = qt, dparams = list(.$deg_free)) +
geom_qq_line(distribution = qt, dparams = list(.$deg_free)) +
ggtitle(.$a)
)
# Using map to unpack the list-column into a list, there's
# probably a better way
cowplot::plot_grid(plotlist=map(plots$plot, ~ .))
Example output:

Related

ECDF plot for all columns in the dataframe in R

I need to plot ECDF's of all columns of the dataframe in one single plot and get an x_limit on the axis too.
The function that I wrote:
library(lattice)
library(latticeExtra)
ecdf_plot <- function(data){
# Drop columns with only NA's
data <- data[, colSums(is.na(data)) != nrow(data)]
data$P_key <- NULL
ecdfplot(~ S12, data=data, auto.key=list(space='right'))
}
Problem:
The ECDF in the above function only plots for the column S12 but I need this for all columns in the dataframe. I know i can do S12 + S13 + ... but the source data changes and we don't exactly know how many and which columns will the dataframe get. Is there a better way out of this? Also, is it possible to get the x_limit for the combined plot to be just one range like xlim(0,100)?
I think this task would be easier using ggplot. It would be very easy to set the limits as required, customise the appearance, etc.
The function would look like this:
library(dplyr)
library(tidyr)
library(ggplot2)
ecdf_plot <- function(data) {
data[, colSums(is.na(data)) != nrow(data)] %>%
pivot_longer(everything()) %>%
group_by(name) %>%
arrange(value, by_group = TRUE) %>%
mutate(ecdf = seq(1/n(), 1 - 1/n(), length.out = n())) %>%
ggplot(aes(x = value, y = ecdf, colour = name)) +
xlim(0, 100) +
geom_step() +
theme_bw()
}
Now let's test it on a random data frame:
set.seed(69)
df <- data.frame(unif = runif(100, 0, 100), norm = rnorm(100, 50, 25))
ecdf_plot(df)

Select variables in one data frame to plot from another

I have large data sets where I am doing exploratory screenings for correlations. I want to do a correlation test to identify significantly related variables, and then plot these variables against each other.
data <- data.frame(a = 1:10, b = c(1.5*(1:9), 10), c = 2*(1:10), d = sample(1:5, 10, replace = T))
cor_data <- corr.test(data)
sig_cor <- ifelse(cor_data$p <0.05, cor_data$r, NA)
sig_cor_long <- sig_cor %>%
data.frame() %>%
mutate(var1 = rownames(sig_cor)) %>%
gather(var2, value = r, -var1) %>%
drop_na(r) %>%
filter(r != 1)
This identifies pairs a-b and b-c as significantly correlated, so I want to plot those. How can I automate this process of selecting the paired variables from sig_cor_long to plot via ggplot from data? An example plot that I want to create for each correlated pair would be:
ggplot(data, aes(a, b)) +
geom_smooth(method = 'lm')+
geom_point(shape = 21, color = 'darkblue', fill = 'white')
I want to have a function to input into ggplot to tell it to plot all the var1 and var2 pairs identified in sig_cor_long for which the raw data are in data.
Ok so, this is one way of plotting e.g. all the plots where there is significant correlation (in a list, so you could do anything with them)
do.call(gridExtra::grid.arrange,
ifelse(cor_data$p <0.05, cor_data$r, NA) %>%
as.data.frame() %>%
rownames_to_column() %>%
gather(pair, val, -rowname) %>%
drop_na() %>%
filter(val != 1) %$%
map2(rowname, pair, ~ggplot() + geom_smooth(method = "lm", aes(data[, .x], data[, .y])) + geom_point(aes(data[, .x], data[, .y])))
)

Functional programming with dplyr

Looking for a more efficient / elegant way to pass multiple arguments to a group-by using non-standard evaluation in a function using dplyr. I don't want to use the ... operator, but to specify the functions individually.
My specific use case is a function which takes a data frame and creates a ggplot object with simpler syntax. Here is an example of the code I want to automate with my function:
# create data frame
my_df <- data.frame(month = sample(1:12, 1000, replace = T),
category = sample(head(letters, 3), 1000, replace = T),
approved = as.numeric(runif(1000) < 0.5))
my_df$converted <- my_df$approved * as.numeric(runif(1000) < 0.5)
my_df %>%
group_by(month, category) %>%
summarize(conversion_rate = sum(converted) / sum(approved)) %>%
ggplot + geom_line(aes(x = month, y = conversion_rate, group = category,
color = category))
I want to combine that group_by, summarize, ggplot, and geom_line into a simple function that I can feed an x, y, and group, and have it perform all the dirty work under the hood. Here's what I've gotten to work:
# create the function that does the grouping and plotting
plot_lines <- function(df, x, y, group) {
x <- enquo(x)
group <- enquo(group)
group_bys <- quos(!! x, !! group)
df %>%
group_by(!!! group_bys) %>%
my_smry %>%
ggplot + geom_line(aes_(x = substitute(x), y = substitute(y),
group = substitute(group), color = substitute(group)))
}
# create a function to do the summarization
my_smry <- function(x) {
x %>%
summarize(conversion_rate = sum(converted) / sum(approved))
}
# use my function
my_df %>%
plot_lines(x = month, y = conversion_rate, group = category)
I feel like the group_by handling is pretty inelegant: quoting x and group with enquo, then unquoting them with !! inside of another quoting function quos, only to re-unquote them with !!! on the next line, but it's the only thing I've been able to get to work. Is there a better way to do this?
Also, is there a way to get ggplot to take !! instead of substitute? What I'm doing feels inconsistent.
You could just do a straight eval.parent(substitute(...)) like this. Being base R it works consistently across R and is simple to do. One can even use an ordinary aes.
plot_lines <- function(df, x, y, group) eval.parent(substitute(
df %>%
group_by(x, group) %>%
my_smry %>%
ggplot + geom_line(aes(x = x, y = y, group = group, color = group))
))
plot_lines(my_df, month, conversion_rate, category)
The problem is that ggplot hasn't been updated to handle quosures yet, so you've got to pass it expressions, which you can create from quosures with rlang::quo_expr:
library(tidyverse)
set.seed(47)
my_df <- data_frame(month = sample(1:12, 1000, replace = TRUE),
category = sample(head(letters, 3), 1000, replace = TRUE),
approved = as.numeric(runif(1000) < 0.5),
converted = approved * as.numeric(runif(1000) < 0.5))
plot_lines <- function(df, x, y, group) {
x <- enquo(x)
y <- enquo(y)
group <- enquo(group)
df %>%
group_by(!! x, !! group) %>%
summarise(conversion_rate = sum(converted) / sum(approved)) %>%
ggplot(aes_(x = rlang::quo_expr(x),
y = rlang::quo_expr(y),
color = rlang::quo_expr(group))) +
geom_line()
}
my_df %>% plot_lines(month, conversion_rate, category)
However, keep in mind that ggplot will almost inevitably be updated from lazyeval to rlang, so while this interface will probably keep working, a simpler, more consistent one will probably be possible shortly.

Using purrr to plot multiple items against each other

I am trying to learn purrr from the tidyverse
I have set up a piece of code to attempt to plot all variables in the iris data-set against each other to see if they are linearly related. Unfortunately I don't seem to get anything back except blank plots. Below is my example. Can anyone help
library(tidyverse)
mydf <- iris %>%
as_tibble %>%
dplyr::select(everything(), -Species)
# Create a grid of names of columns
mynames <- names(mydf)
mygrid <- expand.grid(x=mynames, y =mynames)
# Define function
plot_my_data <- function(mydata, x, y){
ggplot(mydata, aes(x, y)) +
geom_smooth()}
map2(.x = mygrid$x,
.y = mygrid$y,
.f = ~ plot_my_data(mydf, .x,.y))
You have 2 issues in your code.
First one is that you use aes where you should use aes_string, and second is that you have factors and not characters in mygrid.
This works:
mygrid <- expand.grid(x=mynames, y =mynames,stringsAsFactors = F)
# Define function
plot_my_data <- function(mydata, x, y){
ggplot(mydata, aes_string(x, y)) +
geom_smooth()}
map2(.x = mygrid$x,
.y = mygrid$y,
.f = ~ plot_my_data(mydf, .x,.y))

Variable column names in the pipe

I have the following code:
install.packages('tidyverse')
library(tidyverse)
x <- 1:10
y <- x^2
df <- data.frame(first_column = x, second_column = y)
tibble <- as_tibble(df)
tibble %>%
filter(second_column != 16) %>%
ggplot(aes(x = first_column, y = second_column)) +
geom_line()
Now I would like to create the following function
test <- function(colname) {
tibble %>%
filter(colname != 16) %>%
ggplot(aes(x = first_column, y = colname)) +
geom_line()
}
test('second_column')
But running it creates a vertical line instead of the function. How can I make this function work?
Edit: My focus is on getting the pipe to work, not ggplot.
In order to pass character strings for variable names, you have to use the standard evaluation version of each function. It is aes_string for aes, and filter_ for filter. See the NSE vignette for more details.
Your function could look like:
test <- function(colname) {
tibble %>%
filter_(.dots= paste0(colname, "!= 16")) %>%
ggplot(aes_string(x = "first_column", y = colname)) +
geom_line()
}

Resources