I'm struggling to reorder my data for plotting with ggplot in a function that also uses dplyr:
# example data
library(ggplot2)
library(dplyr)
# Example data: three groups of unequal size ("l" x 10, "m" x 5, "o" x 15),
# a column `b` of 30 distinct random integers drawn from 1..100, and a second
# grouping column `c` that runs parallel to `a`.
dat <- data.frame(
  a = rep(c("l", "m", "o"), times = c(10, 5, 15)),
  b = sample(100, 30),
  c = rep(c("q", "r", "s"), times = c(10, 5, 15))
)
Here are my steps outside of a function:
# set a variable: name of the column to tabulate, as a string
colm <- "a"
# make a table: one row per level of `colm`, with the row count in `n`
dat1 <- dat %>%
  group_by_(colm) %>%
  tally(sort = TRUE)
# put in order and plot: bars ordered by their count
# (Fixed: the plot used to reference `dat2`, which is never created here; the
# table built above is `dat1`.)
ggplot(dat1, aes(x = reorder(a, n), y = n)) +
  geom_bar(stat = "identity")
But when I try to make that into a function, I can't seem to use reorder:
# First attempt at wrapping the steps above in a function. This version does
# NOT produce the ordered plot; it is kept as-is to show the failure.
#
# the_data:   data frame to tabulate.
#             NOTE(review): the body pipes from the global `dat`, so this
#             argument is not actually used.
# the_column: name of the grouping column, as a string.
f <- function(the_data, the_column){
dat %>% group_by_(the_column) %>%
tally(sort = TRUE) %>%
# Problem: reorder() is called right here on the two character strings
# themselves (the value of `the_column` and 'n'), not on data columns, and its
# result - rather than an expression string - is handed to aes_string().
# This is what produces the mean.default warning quoted below.
ggplot(aes_string(x = reorder(the_column, 'n'), y = 'n')) +
geom_bar(stat = "identity")
}
f(dat, "a")
Warning message:
In mean.default(X[[i]], ...) :
argument is not numeric or logical: returning NA
The function will work without reorder:
# Count the rows per level of one column and draw the counts as a bar chart.
# Bars appear in the default order of the x values (no reordering).
#
# the_data:   data frame to tabulate.
# the_column: name of the grouping column, as a string.
# Returns a ggplot object.
f <- function(the_data, the_column) {
  # Fixed: start from the `the_data` argument instead of the global `dat`, so
  # the function works on whatever data frame the caller passes in.
  the_data %>%
    group_by_(the_column) %>%
    tally(sort = TRUE) %>%
    ggplot(aes_string(x = the_column, y = 'n')) +
    geom_bar(stat = "identity")
}
f(dat, "a")
And I can get what I want without dplyr, but I'd prefer to use dplyr because it's more efficient in my actual use case:
# without dplyr
# Tabulate one column with base table() and plot the frequencies as bars
# ordered by frequency. `the_column` is the column name as a string and is
# also used as the x-axis title.
ff <- function(the_data, the_column) {
  counts <- data.frame(table(the_data[the_column]))
  bars <- ggplot(counts, aes(x = reorder(Var1, Freq), y = Freq)) +
    geom_bar(stat = "identity")
  bars +
    xlab(the_column) +
    ylab("n")
}
ff(dat, "a")
I see that others have struggled with this (1, 2), but it seems there must be a more efficient dplyr/pipe idiom for this reordering-in-a-function task.
If you are going to use aes_string, then the whole value must be a string, not just partially a string. You can use paste() to help build the expression you want to use for x. For example
# Bar chart of row counts per level of `the_column`, bars ordered by count.
#
# the_data:   data frame to tabulate.
# the_column: name of the grouping column, as a string.
# Returns a ggplot object.
f <- function(the_data, the_column) {
  # Fixed: use the `the_data` argument rather than the global `dat`.
  the_data %>%
    group_by_(the_column) %>%
    tally(sort = TRUE) %>%
    # aes_string() needs the WHOLE x expression as one string, so the
    # reorder() call is assembled as text, e.g. "reorder(a, n)".
    ggplot(aes_string(x = paste0("reorder(", the_column, ", n)"), y = 'n')) +
    geom_bar(stat = "identity")
}
Or you could use expressions rather than strings
# Bar chart of row counts per level of `the_column`, bars ordered by count.
# Same result as the aes_string() variant, but the aesthetics are built from
# language objects instead of strings.
#
# the_data:   data frame to tabulate.
# the_column: name of the grouping column, as a string.
# Returns a ggplot object.
f <- function(the_data, the_column) {
  # substitute() swaps the placeholder `x` for the column's symbol, giving the
  # unevaluated call reorder(<column>, n).
  x_expr <- substitute(reorder(x, n), list(x = as.name(the_column)))
  # Fixed: use the `the_data` argument rather than the global `dat`.
  the_data %>%
    group_by_(the_column) %>%
    tally(sort = TRUE) %>%
    ggplot(aes_q(x = x_expr, y = quote(n))) +
    geom_bar(stat = "identity")
}
but the general idea is that you need to be careful when mixing strings and raw language elements (like names or expressions).
Related
I have a large dataset in which I want to group similar resistance patterns together. A plot to visualize similarity of resistance pattern is needed.
# Example data: nine Ids (A-I), each with a pattern string made of the letters
# S and R. All patterns have 9 characters except G, which has only 5.
dat <- read.table(text="Id Resistance.Pattern
A SSRRSSSSR
B SSSRSSSSR
C RRRRSSRRR
D SSSSSSSSS
E SSRSSSSSR
F SSSRRSSRR
G SSSSR
H SSSSSSRRR
I RRSSRRRSS", header=TRUE)
I would separate out the values into a wider dataframe and then make a heatmap and dendrogram to compare similarities in patterns:
library(tidyverse)
library(ggdendro)
# Recode every pattern position to a number (S = 1, R = 2, missing = 0) so the
# rows can be clustered. One column per position: pat_1, pat_2, ...
recode_dat <- dat |>
  mutate(pat = str_split(Resistance.Pattern, "")) |>
  unnest_wider(pat, names_sep = "_") |>
  select(starts_with("pat_")) |>
  mutate(across(
    everything(),
    ~ case_when(. == "S" ~ 1, . == "R" ~ 2, is.na(.) ~ 0)
  )) |>
  # Fixed: convert to a base data frame before assigning row names. Setting
  # row names on a tibble is deprecated and emits a warning.
  as.data.frame()
rownames(recode_dat) <- dat$Id

# Hierarchical clustering of the standardised codes. The leaf order of the
# dendrogram is reused below to order the heatmap rows.
dendro <- as.dendrogram(hclust(d = dist(x = scale(recode_dat))))
dendro_plot <- ggdendrogram(data = dendro, rotate = TRUE)

# Heatmap of the raw S/R letters, one tile per Id and pattern position, with
# the Ids arranged in dendrogram order so both panels line up.
heatmap_plot <- dat |>
  mutate(pat = str_split(Resistance.Pattern, "")) |>
  unnest_wider(pat, names_sep = "_") |>
  pivot_longer(cols = starts_with("pat_"), names_to = "pattern_position") |>
  mutate(Id = factor(Id, levels = dat$Id[order.dendrogram(dendro)])) |>
  ggplot(aes(pattern_position, Id)) +
  geom_tile(aes(fill = value)) +
  # show only the position number ("pat_3" -> "3") on the x axis
  scale_x_discrete(labels = \(x) sub(".*_(\\d+$)", "\\1", x)) +
  theme(legend.position = "top")

cowplot::plot_grid(heatmap_plot, dendro_plot, nrow = 1, align = "h", axis = "tb")
It sounds as though the second column of your data frame represents sensitivity (S) and resistance (R), presumably to antibiotics (though this is not clear in your question). That being the case, you are presumably looking for something like this:
library(tidyverse)
# Split every pattern into single characters.
p <- strsplit(dat$Resistance.Pattern, "")
# Number of positions in the longest pattern, and the matching column names
# (as.data.frame() on an unnamed matrix names the columns V1, V2, ...).
# Derived from the data instead of being hard-coded to 9.
n_pos <- max(lengths(p))
pos_cols <- paste0("V", seq_len(n_pos))

# Pad shorter patterns with NA so all rows have n_pos entries, stack them into
# a matrix, then reshape to one row per (Id, position) for plotting.
do.call(rbind, lapply(p, \(x) c(x, rep(NA, n_pos - length(x))))) %>%
  as.data.frame() %>%
  cbind(Id = dat$Id) %>%
  # reverse the levels so the first Id is drawn at the top
  mutate(Id = factor(Id, rev(Id))) %>%
  pivot_longer(all_of(pos_cols)) %>%
  # keep positions in numeric order (V10 must not sort before V2)
  mutate(name = factor(name, levels = pos_cols)) %>%
  ggplot(aes(name, Id, fill = value)) +
  # `linewidth` (ggplot2 >= 3.4.0) replaces the deprecated `size` for borders
  geom_tile(col = "white", linewidth = 2) +
  coord_equal() +
  scale_fill_manual(values = c("#e02430", "#d8d848"),
                    labels = c("Resistant", "Sensitive"),
                    na.value = "gray95") +
  scale_x_discrete(name = "Antibiotic", position = "top",
                   labels = seq_len(n_pos)) +
  labs(fill = "Resistance", y = "ID") +
  theme_minimal(base_size = 20) +
  theme(text = element_text(color = "gray30"))
I'd separate the entries by character, convert the binary data to numeric and plot the matrix as a heatmap and show the character string as rownames.
Whether to use row and/or column clustering depends on what's desired.
library(dplyr)
library(tidyr) # for unnest_wider
library(gplots) # for heatmap.2
# Build a table with the two label columns (pattern, Id) first, followed by
# one numeric column per pattern position (R = 1, S = 0, missing = NA).
mm <-
  dat %>%
  as_tibble() %>%
  # Sort by pattern and put the label columns first. This replaces a grouped
  # summarize() that returned whole columns per group, a usage deprecated
  # since dplyr 1.1.0.
  arrange(Resistance.Pattern) %>%
  select(Resistance.Pattern, Id) %>%
  mutate(binary = strsplit(Resistance.Pattern, "")) %>%
  unnest_wider(binary, names_sep = "") %>%
  mutate(across(starts_with("binary"), ~ as.numeric(c(R = 1, S = 0)[.x])))

mm2 <- as.matrix(mm[, -c(1, 2)]) |> unname() # the numeric part
# row labels: "<pattern> <Id>"
rownames(mm2) <- paste(mm$Resistance.Pattern, mm$Id)

heatmap.2(mm2, trace = "none", Colv = "none", dendrogram = "row",
          col = c("green", "darkgreen"), margins = c(10, 10))
Consider the following example:
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dplyr)
# Demo data: 141 points along the diagonal, a random fill value and three
# facet groups.
df <- data.frame(
  x = seq(0, 70, 0.5),
  y = seq(0, 70, 0.5),
  val = rnorm(141),
  group = rep(1:3, 47)
)

# Upper end of the break range: the largest value, rounded to a whole number.
max_val_plot <- round(max(df$val), 0)
# NOTE(review): the original line reads `min_val_plot <--df$val %>% ...`,
# which R parses as `<- (-df$val) %>% ...`, i.e. the rounded minimum of the
# NEGATED values. That behaviour is kept here; confirm whether `<- df$val`
# was intended.
min_val_plot <- round(min(-df$val), 0)

# Breaks every 0.1 between the two ends, with one palette colour per break.
breaks_plot <- seq(min_val_plot, max_val_plot, 0.1)
n <- length(breaks_plot)
getPalette <- colorRampPalette(brewer.pal(9, "RdBu"))
colors_plot <- getPalette(n)

# Labels are the break values as text, with every second one blanked out.
labels_plot <- as.character(breaks_plot)
labels_plot[c(FALSE, TRUE)] <- ' '

ggplot(ungroup(df), aes(x = x, y = y, fill = val)) +
  geom_raster() +
  facet_grid(~group) +
  theme_bw(base_size = 20) +
  scale_fill_stepsn(
    name = "",
    colours = colors_plot,
    breaks = breaks_plot,
    labels = labels_plot
  )
Although labels and breaks are of equal length, the error "Breaks and labels are different lengths" is returned due to the presence of multiple groups and the faceted function in plotting code.
How can I fix this?
Thanks!
One option to fix your issue would be to pass a function to the labels argument of scale_fill_xxx to create the labels on the fly instead of providing the labels as a vector.
library(ggplot2)
library(RColorBrewer)
library(magrittr)
library(dplyr)
set.seed(123)
# Same plot as in the question, except that `labels` is a function, so the
# labels are generated from whatever breaks the scale passes to it instead of
# coming from a fixed-length vector.
ggplot(ungroup(df), aes(x = x, y = y, fill = val)) +
  geom_raster() +
  facet_grid(~group) +
  theme_bw(base_size = 20) +
  scale_fill_stepsn(
    name = "",
    colours = colors_plot,
    breaks = breaks_plot,
    labels = function(x) {
      shown <- as.character(x)
      shown[c(FALSE, TRUE)] <- " " # blank out every second label
      shown
    }
  )
The function is:
# Violin plot of one score column per Decade, with the per-decade median
# printed as a label. This is the question's version and it fails with
# "Error in !as.character(color) : invalid argument type" (see below).
#
# df:    data frame; the body uses its columns Decade and Release.Date plus
#        the column named by `var`.
# var:   name of the score column, as a string.
# color: colour used for the violin outline/fill and the label text.
# title: plot title.
violin_rating_by_time <- function(df,var,color,title){
# Per-decade medians of `var`, computed on rows where both Decade and `var`
# are present; the result column is renamed from "temp" to the value of `var`.
label <- df %>%
filter(!is.na(Decade))%>%
filter(!is.na(!!as.name(var)))%>%
mutate(Decade = fct_reorder(factor(Decade),Release.Date))%>%
group_by(Decade)%>%
summarise(temp=median(!!as.name(var)))
names(label)[names(label)=="temp"] <- var
plot <- df %>%
filter(!is.na(Decade))%>%
mutate(Decade = fct_reorder(factor(Decade),Release.Date))%>%
ggplot(aes(x=Decade,y=!!as.name(var)))+
# This is where the error comes from: `color =` and `fill =` are ordinary
# arguments, not quasiquotation contexts, so `!!as.character(color)` is plain
# double logical negation applied to a character string.
geom_violin(color=!!as.character(color),fill=!!as.character(color),position="dodge",outlier.colour="transparent",alpha = 0.4,draw_quantiles = c(0.5))+
geom_label(data = label, aes(label = !!as.name(var)),color=!!as.character(color),vjust=-0.4,hjust=0.9)+
theme_aes()+
theme(legend.position = "NA") +
ggtitle(title)
return(plot)
}
I pass arguments into the function (df is just a data frame I'm using, not too important).
violin_rt <- violin_rating_by_time(df=df
,var="Metacritic.Score"
,color="#EDDDD4"
,title="Distribution of MC Scores by Decade")
I get the following error message
Error in !as.character(color) : invalid argument type
It's coming from the geom_violin function. When I plot without colour, it works fine.
Thanks!
edit: theme_aes() is a function I use for some basic theme arguments in ggplot. It's not causing the error.
Not sure of the root cause, but it works for me if the !!as.character(color)'s are replaced with {{ color }}.
# Violin plot of the column named by `var` for each Decade, with the
# per-decade median shown as a label.
#
# df:    data frame with columns Decade, Release.Date and the column `var`.
# var:   name of the score column, as a string.
# color: colour for the violin outline/fill and the label text.
# title: plot title.
# Returns a ggplot object.
violin_rating_by_time <- function(df, var, color, title) {
  var_sym <- as.name(var)

  # Median of `var` per decade, on rows where Decade and `var` are present.
  medians <- df %>%
    filter(!is.na(Decade), !is.na(!!var_sym)) %>%
    mutate(Decade = fct_reorder(factor(Decade), Release.Date)) %>%
    group_by(Decade) %>%
    summarise(temp = median(!!var_sym))
  # give the median column the same name as the plotted variable
  names(medians)[names(medians) == "temp"] <- var

  plot_data <- df %>%
    filter(!is.na(Decade)) %>%
    mutate(Decade = fct_reorder(factor(Decade), Release.Date))

  ggplot(plot_data, aes(x = Decade, y = !!var_sym)) +
    geom_violin(
      color = {{ color }},
      fill = {{ color }},
      position = "dodge",
      alpha = 0.4,
      draw_quantiles = c(0.5)
    ) +
    geom_label(
      data = medians,
      aes(label = !!var_sym),
      color = {{ color }},
      vjust = -0.4,
      hjust = 0.9
    ) +
    theme(legend.position = "NA") +
    ggtitle(title)
}
Fake data
df <- data.frame(Decade = 2000,
Metacritic.Score = rnorm(10, 10),
Release.Date = 2005)
Test
violin_rating_by_time(df=df
,var="Metacritic.Score"
,color="#EDDDD4"
,title="Distribution of MC Scores by Decade")
There is no need for !! or {{. First, color is already a color code passed as character string. So you could simply pass it to the color and/or fill argument in geom_violin as is. Additionally, !! will only work within a quasiquotation context, e.g. inside a dplyr pipeline:
library(dplyr)
dd <- data.frame(x = 1)
foo <- "foo"
## This works
dd %>%
mutate(y = !!as.character(foo))
#> x y
#> 1 1 foo
## This does not work
dd$z <- !!as.character(foo)
#> Error in !as.character(foo): invalid argument type
Second, when you pass a column name as a character string you could access the column inside a dplyr pipeline or inside aes() via the .data pronoun which is also the recommended way to do so. See Programming with dplyr and Best practices for programming with ggplot2.
Making use of the example data provided by @JonSpring:
# Violin plot of the column named by `var` for each Decade, with the
# per-decade median shown as a label.
#
# df:    data frame with columns Decade, Release.Date and the column `var`.
# var:   name of the score column, as a string; accessed via `.data[[var]]`.
# color: colour (e.g. a hex code string) for the violins and the label text;
#        passed to the geoms as-is.
# title: plot title.
# Returns a ggplot object.
violin_rating_by_time <- function(df, var, color, title) {
  # Median of `var` per decade. The glue-style name "{var}" := ... names the
  # result column after `var` directly, so no rename step is needed.
  label <- df %>%
    filter(!is.na(Decade)) %>%
    filter(!is.na(.data[[var]])) %>%
    mutate(Decade = fct_reorder(factor(Decade), Release.Date)) %>%
    group_by(Decade) %>%
    summarise("{var}" := median(.data[[var]]))

  df %>%
    filter(!is.na(Decade)) %>%
    mutate(Decade = fct_reorder(factor(Decade), Release.Date)) %>%
    ggplot(aes(x = Decade, y = .data[[var]])) +
    # `outlier.colour` is carried over from the question; geom_violin() does
    # not know it, which is what triggers the warning in the output below.
    geom_violin(color = color, fill = color, position = "dodge", outlier.colour = "transparent", alpha = 0.4, draw_quantiles = c(0.5)) +
    geom_label(data = label, aes(label = .data[[var]]), color = color, vjust = -0.4, hjust = 0.9) +
    #theme_aes() +
    # Fixed: "NA" is not a documented legend position; "none" hides the legend.
    theme(legend.position = "none") +
    ggtitle(title)
}
library(ggplot2)
library(dplyr)
library(forcats)
set.seed(42)
df <- data.frame(Decade = 2000,
Metacritic.Score = rnorm(10, 10),
Release.Date = 2005)
violin_rating_by_time(
df = df,
var = "Metacritic.Score",
color = "#EDDDD4",
title = "Distribution of MC Scores by Decade"
)
#> Warning: Ignoring unknown parameters: outlier.colour
Here's a code block:
# scale the log of price per group (cut)
# Result columns: cut, price, scaled_log_price, mean_log_price, sd_log_price.
my_diamonds <- diamonds %>%
  mutate(log_price = log(price)) %>%
  group_by(cut) %>%
  mutate(
    # z-score within each cut, as opposed to overall
    scaled_log_price = as.numeric(scale(log_price)),
    # per-cut mean and sd, repeated on every row of that cut (a grouped
    # mutate replaces the former nest() / map_dbl() / unnest() round trip)
    mean_log_price = mean(log_price),
    sd_log_price = sd(log_price)
  ) %>%
  ungroup() %>%
  # rows ordered by cut, as the nest()/unnest() version produced them
  arrange(cut) %>%
  # Fixed: the range used to start at `price_scaled`, a column that does not
  # exist; the column created above is `scaled_log_price`.
  select(cut, price, scaled_log_price:sd_log_price)

# for each cut, find the back transformed actual values (exp) of each unit of zscore between -3:3
# Adds columns mean_less_3z ... mean_less_1z, mean_plus_0z ... mean_plus_3z.
for (i in -3:3) {
  col_name <- paste0("mean_", if (i < 0) "less_" else "plus_", abs(i), "z")
  my_diamonds <- my_diamonds %>%
    mutate(!!col_name := exp(mean_log_price + i * sd_log_price))
}

my_diamonds_split <- my_diamonds %>% group_split(cut)
# Fixed: take the names from the factor keys in the same (factor-level) order
# that group_split() uses. Converting `cut` to character BEFORE computing the
# keys sorted them alphabetically, so e.g. the element named "Ideal" actually
# held the "Very Good" rows.
split_names <- my_diamonds %>%
  group_by(cut) %>%
  group_keys() %>%
  pull(cut) %>%
  as.character()
names(my_diamonds_split) <- split_names
I now have a variable my_diamonds_split that is a list of data frames. I would like to loop over these data frames and each time create a new ggplot.
I can use a custom labeller function with a single df, but I don't know how to do this within a loop:
# Axis labeller: prints the tick value and, on a second line, a dollar amount
# computed from the mean and sd of `price` in the global `ex_df`.
labeller <- function(x) {
  dollars <- scales::dollar(sd(ex_df$price) * x + mean(ex_df$price))
  paste0(x, "\n", dollars)
}

ex_df <- my_diamonds_split$Ideal
ggplot(ex_df, aes(x = scaled_log_price)) +
  geom_density() +
  scale_x_continuous(labels = labeller, limits = c(-3, 3))
This creates a plot for the 'Ideal' cut of diamonds. I also get two data points on the x axis, the zscore values at -2, 0 and 2 as well as the raw dollar values of 3.8K, 3.9K and 11.8K.
When I define the labeller function, I must specify the df to scale with. I tried using the dot in place of ex_df, hoping that ggplot would pick up the data frame being piped in on each iteration:
# Second attempt: refer to the piped data frame as `.` inside the labeller.
# This fails with "object '.' not found" (see below): `.` is only the pipe's
# placeholder within the piped expression itself, and no object of that name
# is visible from inside this separately defined function.
labeller <- function(x) {
paste0(x,"\n", scales::dollar(sd(.$price) * x + mean(.$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, limits = c(-3, 3))
Returns:
Error in is.data.frame(x) : object '.' not found
I then tried writing the function to accept an argument for the df to scale with:
# Third attempt: pass the data frame to the labeller as a second argument.
# x:  tick values supplied by the scale.
# df: data frame whose `price` column provides the mean and sd.
labeller <- function(x, df) {
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
ex_df <- my_diamonds_split$Ideal
ex_df %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
# This fails: `labeller(df = ex_df)` CALLS the function immediately, without
# `x`, instead of handing the scale a function to call later - hence the
# 'argument "x" is missing' error below.
# NOTE(review): a wrapper such as `function(x) labeller(x, df = ex_df)` would
# defer the call; untested here.
scale_x_continuous(label = labeller(df = ex_df), limits = c(-3, 3)) # because when it comes to running in real life, I will try something like labeller(df = my_diamonds_split[[i]])
Error in paste0(x, "\n", scales::dollar(sd(df$price) * x + mean(df$price))) :
argument "x" is missing, with no default
Bearing in mind that the scaling must be done per iteration, how could I loop over my_diamonds_split, and on each iteration generate a ggplot per above?
# Sketch of the desired loop (incomplete by design): `df` inside the labeller
# is not defined anywhere, which is exactly the open question.
labeller <- function(x) {
# how can I make df variable
paste0(x,"\n", scales::dollar(sd(df$price) * x + mean(df$price)))
}
# One density plot per element of my_diamonds_split, looked up by name.
# NOTE(review): a ggplot object built inside a for loop is not auto-printed;
# it presumably needs print() (or to be stored) to be displayed - confirm.
for (i in split_names) {
my_diamonds_split[[i]] %>%
ggplot(aes(x = scaled_log_price)) +
geom_density() +
scale_x_continuous(label = labeller, # <--- here, labeller must be defined with df$price except that will difer on each iteration
limits = c(-3, 3))
}
There's a hacky way to get this result in facets. Basically, after converting to z scores, you add different amounts (say, multiples of 1000) to each group's z scores. Then you set all the breaks to this collection of points and label them with pre-calculated labels.
library(ggplot2)
library(dplyr)
# For one cut level `x`, build seven tick labels: the z-score (-3..3) on the
# first line and the matching dollar price (mean + z * sd, rounded) on the
# second.
f <- function(x) {
  y <- diamonds$price[diamonds$cut == x]
  z_steps <- seq(-3, 3)
  dollars <- scales::dollar(round(mean(y) + z_steps * sd(y)))
  paste(z_steps, dollars, sep = "\n")
}
# All labels in one vector: seven per cut, in the order of the cut levels.
breaks <- unlist(lapply(levels(diamonds$cut), f), use.names = FALSE)

diamonds %>%
  group_by(cut) %>%
  # Shift each cut's z-scores into its own range: z-scores -3..3 land on
  # 1000..1006 for the first cut, 2000..2006 for the second, and so on.
  mutate(z = scale(price) + 3 + 1000 * as.numeric(cut)) %>%
  ggplot(aes(z)) +
  # invisible points, shifted left by 2, that only widen each panel's x range
  geom_point(aes(x = z - 2, y = 1), alpha = 0) +
  geom_density() +
  # tick positions 1000..1006, 2000..2006, ..., 5000..5006, paired with the
  # pre-computed labels
  scale_x_continuous(
    breaks = as.vector(outer(0:6, 1:5 * 1000, "+")),
    labels = breaks
  ) +
  facet_wrap(vars(cut), scales = "free_x") +
  theme(
    text = element_text(size = 16),
    axis.text.x = element_text(size = 6)
  )
You would have to increase the plot size to make the dollar values more visible of course.
Created on 2020-08-04 by the reprex package (v0.3.0)
Let's say, in R, I have a data frame of letters, numbers and animals and I want to examine the relationship between all three graphically. I could do something like this:
library(dplyr)
library(ggplot2)
library(gridExtra)
set.seed(33)
# 30 rows: three animals with ten letters each (the letter ranges overlap
# between animals) and one standard-normal value per row.
my_df <- data.frame(
  letters = letters[c(1:10, 6:15, 11:20)],
  animals = rep(c("sheep", "cow", "horse"), each = 10),
  numbers = rnorm(30)
)
ggplot(my_df, aes(x = letters, y = numbers)) + geom_point() +
facet_wrap(~animals, ncol = 1, scales = 'free_x')
I'd get something that looks like.
However, I want the order of the x axis to be dependent on the order of the y-axis. This is easy enough to do without facets, as per this example.
I can even make an ordered figure for each animal and then bind them together with grid.arrange as in this example
# One subset per animal ...
my_df_shp <- filter(my_df, animals == 'sheep')
my_df_cow <- filter(my_df, animals == 'cow')
my_df_horse <- filter(my_df, animals == 'horse')

# ... each with its letters reordered by that animal's numbers ...
my_df_shp1 <- mutate(my_df_shp, letters = reorder(letters, numbers))
my_df_cow1 <- mutate(my_df_cow, letters = reorder(letters, numbers))
my_df_horse1 <- mutate(my_df_horse, letters = reorder(letters, numbers))

# ... plotted separately and stacked in a single column.
p_shp <- ggplot(my_df_shp1, aes(x = letters, y = numbers)) +
  geom_point()
p_cow <- ggplot(my_df_cow1, aes(x = letters, y = numbers)) +
  geom_point()
p_horse <- ggplot(my_df_horse1, aes(x = letters, y = numbers)) +
  geom_point()
grid.arrange(p_shp, p_cow, p_horse, ncol = 1)
I don't particularly like this solution though, because it isn't easily generalizable to cases where there are a lot of facets.
I'd rather do something like
ggplot(my_df, aes(x = y_ordered_by_facet(letters, by = numbers), y = numbers)) + geom_point() +
facet_wrap(~animals, ncol = 1, scales = 'free_x')
Where y_ordered_by_facet is some function that cleverly orders the letters factor to be in the same order as the numbers.
Something that gets close to this, but doesn't quite seem to work is
ggplot(my_df, aes(x = reorder(letters, numbers), y = numbers)) +
geom_point() + facet_wrap(~animals, ncol = 1, scales = 'free_x')
That doesn't quite work because the order ends up taking effect before, rather than after the facet wrapping and thus putting the labels in not quite the right order for each panel.
Any clever ideas?
I've found dplyr doesn't work super well with group_by() when dealing with different factor levels in each of the groups. So one work around is thinking of creating a new factor that's unique for each animal-letter combination and ordering that. First, we create an interaction variable with animal+letter and determine the proper order for each of the letters for the animals
# Character vector of "animal.letter" keys, animal by animal, with the letters
# of each animal ordered by their mean `numbers` value (the same ordering
# reorder() applies by default). Written without do() and data_frame(), which
# are superseded/deprecated.
new_order <- my_df %>%
  group_by(animals, letters) %>%
  summarise(score = mean(numbers), .groups = "drop") %>%
  arrange(animals, score) %>%
  # same "animal.letter" labels that interaction() produces
  mutate(al = paste(animals, letters, sep = ".")) %>%
  pull(al)
Now we create the interaction variable in the data we want to plot, use this new ordering, and finally change the labels so they look like just the letters again
# Plot against the combined animal.letter factor (levels in `new_order`), but
# label the ticks with the letter part only (everything up to the last dot is
# stripped).
my_df %>%
  mutate(
    al = factor(paste(animals, letters, sep = "."), levels = new_order)
  ) %>%
  ggplot(aes(x = al, y = numbers)) +
  geom_point() +
  facet_wrap(~animals, ncol = 1, scales = 'free_x') +
  scale_x_discrete(
    breaks = new_order,
    labels = gsub("^.*\\.", "", new_order)
  )
set.seed(33)
# Same example data as in the question.
my_df <- data.frame(
  letters = letters[c(1:10, 6:15, 11:20)],
  animals = rep(c("sheep", "cow", "horse"), each = 10),
  numbers = rnorm(30)
)

# Sort the rows by `numbers` within each animal and add `lett`, a combined
# animal.letter factor. The result stays grouped by `animals`.
my_df <- my_df %>%
  group_by(animals) %>%
  arrange(numbers, .by_group = TRUE) %>%
  mutate(lett = factor(interaction(animals, letters, drop = TRUE)))

# Order the combined factor by `numbers` and show only the letter part
# (everything up to the last dot is stripped) as the tick label.
ggplot(my_df, aes(x = reorder(lett, numbers), y = numbers)) +
  geom_point(size = 3) +
  facet_wrap(~animals, ncol = 1, scales = 'free_x') +
  scale_x_discrete(
    breaks = my_df$lett,
    labels = gsub("^.*\\.", "", my_df$lett)
  )