ordering and plotting by one variable conditional on a second - r

Task: I would like to reorder the levels of a factor variable by the difference in a numeric variable between the rows where a second variable equals 1 and the rows where it equals 0. Here is a reproducible example to clarify:
# Load the tidyverse (provides ggplot2)
library(tidyverse)

# Toy data: one average per place
df1 <- data.frame(
  place = c("A", "B", "C"),
  avg = c(3.4, 4.5, 1.8)
)

# First attempt: the x axis does not follow the order of avg
ggplot(data = df1, mapping = aes(x = place, y = avg)) +
  geom_point(size = 4)

# Turn place into a factor whose levels follow increasing avg
levels_by_avg <- df1$place[order(df1$avg)]
df1$place <- factor(df1$place, levels = levels_by_avg)

# Same plot again, now drawn in order of avg
ggplot(data = df1, mapping = aes(x = place, y = avg)) +
  geom_point(size = 4)
# Second data set: every place now has two rows, one per value of `new`
df2 <- data.frame(
  place = rep(c("A", "B", "C"), each = 2),
  new = rep(0:1, times = 3),
  avg = c(3.4, 2.3, 4.5, 4.2, 2.1, 1.8)
)

# Colour the points by the conditional variable `new`
ggplot(df2, aes(x = place, y = avg, colour = factor(new))) +
  geom_point(size = 3)
Goal: I would like to order and plot the factor variable place by the difference of avg between place when new is 1 and place when new is 0

You can create the levels for the place column by:
library(tidyr)

# One row per place, with the avg for new == 0 and new == 1 in columns `0`
# and `1`. pivot_wider() is the current tidyr replacement for the superseded
# spread().
wide <- pivot_wider(df2, names_from = new, values_from = avg)

# Order the places by avg(new == 1) - avg(new == 0), smallest difference first
df2$place <- factor(df2$place, levels = wide$place[order(wide$`1` - wide$`0`)])

ggplot(df2, aes(x = place, y = avg, col = factor(new))) +
  geom_point(size = 3) +
  labs(color = "new")
gives:

If I understand the goal correctly, then factor A has the biggest difference:
avg(new = 0) - avg(new = 1) = 1.1
So you can spread the data frame to calculate the difference, then gather, then plot avg versus place, reordered by diff. Or if you want A first, by -diff.
But let me know if I didn't understand correctly :)
# Reshape to wide to compute the per-place difference, then back to long for
# plotting. pivot_wider()/pivot_longer() are the current tidyr replacements
# for the superseded spread()/gather().
df2 %>%
  pivot_wider(names_from = new, values_from = avg) %>%
  mutate(diff = `0` - `1`) %>%
  pivot_longer(c(`0`, `1`), names_to = "new", values_to = "avg") %>%
  # reorder() sorts the places by increasing diff; use -diff to reverse
  ggplot(aes(reorder(place, diff), avg)) +
  geom_point(aes(color = factor(new)), size = 3)

Calculate the column first using dplyr:
# Add, for every place, the difference avg(new == 1) - avg(new == 0).
# The result has to be stored: the pipeline on its own leaves df2 unchanged,
# so a `diff` column would not exist when plotting. The difference is computed
# from the value of `new` rather than from the row order within each place.
df2_diff <- df2 %>%
  group_by(place) %>%
  mutate(diff = avg[new == 1] - avg[new == 0]) %>%
  ungroup()

ggplot(df2_diff, aes(x = place, y = diff, color = diff)) +
  geom_point(size = 3)

Related

Plot a line on a barchart in ggplot2

I have built a stacked bar chart showing the relative proportions of responses to different questions. Now I want to show a particular response on top of that bar chart, to show how an individual's response relates to the overall proportions of responses.
I created a toy example here:
library(ggplot2)

n <- 1000
n_groups <- 5

# Simulated responses: n rows split evenly across n_groups questions.
# `each` is derived from n (not a hard-coded 1000) so that changing n keeps
# all three columns the same length; n must be a multiple of n_groups.
overall_df <- data.frame(
  state = sample(letters[1:8], n, replace = TRUE),
  frequency = runif(n, min = 0, max = 1),
  var_id = rep(LETTERS[1:n_groups], each = n / n_groups)
)

# One individual's response to each question (column name = var_id)
row <- data.frame(
  A = "a", B = "b", C = "c", D = "h", E = "b"
)

# Stacked bars, rescaled to proportions within each var_id
ggplot(overall_df, aes(fill = state, y = frequency, x = var_id)) +
  geom_bar(position = "fill", stat = "identity")
The goal here is to have the responses in the object row plotted as a point in the corresponding barchart box, with a line connecting the points.
Here is a (poorly drawn) example of the desired result. Thanks for your help.
This was trickier than I thought. I'm not sure there's any way round manually calculating the x/y co-ordinates of the line.
library(dplyr)
library(ggplot2)

# Total frequency for every state / var_id combination
df <- overall_df %>%
  group_by(state, var_id) %>%
  summarize(frequency = sum(frequency))

# Height of the marker inside one bar: the share of the bar taken by the
# states that sort after `val`, plus half of the share of `val` itself.
marker_height <- function(d, val) {
  after <- sum(d$frequency[d$state > val])
  own <- d$frequency[d$state == val]
  (after + 0.5 * own) / sum(d$frequency)
}

# One height per var_id; `row` supplies the chosen state for each var_id
freq <- unlist(Map(marker_height, d = split(df, df$var_id), val = row))

line_df <- data.frame(
  state = unlist(row),
  frequency = freq,
  var_id = names(row)
)

# Bars first, then the connecting line and the points on top
ggplot(df, aes(fill = state, y = frequency, x = var_id)) +
  geom_col(position = "fill") +
  geom_line(data = line_df, aes(group = 1)) +
  geom_point(data = line_df, aes(group = 1))
Created on 2022-03-08 by the reprex package (v2.0.1)
Here's an automated approach using dplyr. I prepare the summary by joining the label data to the original data, and then using group_by + summarize to get those.
library(dplyr)

# Chosen state for each var_id
row_df <- data.frame(
  state = letters[1:n_groups],
  var_id = LETTERS[1:n_groups]
)

# After the join, state.x is the chosen state (from row_df) and state.y is the
# state of each matching row of overall_df.
line_df <- row_df %>%
  left_join(overall_df, by = "var_id") %>%
  group_by(var_id) %>%
  summarize(
    state = last(state.x),
    # share of states sorting after the chosen one, plus half of its own share
    frequency = (
      sum(frequency[state.x < state.y]) +
        sum(frequency[state.x == state.y]) / 2
    ) / sum(frequency)
  )

ggplot(overall_df, aes(fill = state, y = frequency, x = var_id)) +
  geom_col(position = "fill") +
  geom_point(data = line_df) +
  geom_line(data = line_df, aes(group = 1))

R / Tidyverse: Ordering factors within group with duplicate labels and plotting using facet_wrap

I am trying to plot a graph with ggplot where I facet on one variable and make two plots (one for each group), each showing a value (x) for a category (y), where y is plotted in descending order with respect to x within each group. The issue here is that the same y label exists in each group, and this seems to mess up the factor levels.
Setting up the data:
# Raw values; some y_var labels occur in both groups
raw_values <- tibble(
  y_var = c("v1", "v2", "v3", "v4", "v5", "v2", "v6", "v7", "v4", "v7"),
  x_var = c(0.0629, 0.0631, 0.0654, 0.0657, 0.0676, 0.0693, 0.0707, 0.0728, 0.0733, 0.0868),
  group_var = c("A", "B", "B", "A", "B", "A", "A", "A", "B", "B")
)

# Within each group: sort by x_var, then make y_var a factor whose levels
# follow that sorted order
dummy_data <- raw_values %>%
  group_by(group_var) %>%
  arrange(x_var, .by_group = TRUE) %>%
  mutate(y_var = factor(y_var, levels = y_var))

# One panel per group, stacked vertically, each with its own axes
ggplot(dummy_data) +
  geom_point(aes(x = x_var, y = y_var, color = group_var), size = 5) +
  facet_wrap(~group_var, scales = "free", dir = "v")
The output, shown below, is not ordered as desired.
This can be rectified by making all y_var values unique (adding group name to the value), however this is not desirable because this changes y axis labels (shown below)
# Workaround: append the group name so that every label is unique, then
# rebuild the factor from the (already sorted) labels
unique_labels <- dummy_data %>%
  mutate(
    y_var = str_c(y_var, group_var),
    y_var = factor(y_var, levels = y_var)
  )

ggplot(unique_labels) +
  geom_point(aes(x = x_var, y = y_var, color = group_var), size = 5) +
  facet_wrap(~group_var, scales = "free", dir = "v")
I would like to produce the second graph, without having to change the value of all y_var. Thanks!
tidytext::reorder_within() does something similar, and in combination with tidytext::scale_y_reordered() helps with tidying the output to look like your goal.
library(tidytext)

# reorder_within() orders y_var by x_var separately inside each group_var
reordered <- dummy_data %>%
  mutate(y_var = reorder_within(y_var, x_var, group_var))

# scale_y_reordered() is the companion scale used to tidy the axis labels
ggplot(reordered) +
  geom_point(aes(x = x_var, y = y_var, color = group_var), size = 5) +
  scale_y_reordered() +
  facet_wrap(~group_var, scales = "free", dir = "v")

Specifying factor correctly to prevent ggplot legend from ordering alphabetically (I know this has been asked before)

I know this has been asked many times, which makes me feel a bit better about being confused. I am trying to keep the color order consistent with the variable selected for plotting and consistent in the legend.
I looked here and here for what I hoped would solve my problem, but it was either a different issue, or I must have either misunderstood or am making a mistake that I can't see.
I specified the order of the factor variable, Group, but the colors change with the alphabetical order as opposed to the factor order. Since I specify the Group as a factor and specify the level order ("B" is always first as "group_1") I would have thought the color would have been consistent for "B" but it changes.
If someone could let me know what I am missing I would greatly appreciate it!
Reproducible example
# Question code: first pyramid plot (groups "B" and "A"). It is kept exactly as
# posted because it demonstrates the colour problem being asked about.
library(tidyverse)
# Sample data
Group <- c("A", "B", "C")
Value <- c(3, 3, 5)
# Create data frame
mydata <- data.frame(Group, Value)
# Create variable for group selected for plotting
group_1 <- "B"
group_2 <- "A"
# Make a pyramid plot, making one group negative numbers for bar chart:
# keep only the two selected groups and flip the sign of group_2's values
pyramid <- mydata %>%
filter(Group == group_1 | Group == group_2) %>%
mutate(Value = ifelse(Group == group_2, Value * -1, Value)) %>%
# Thought this would keep color ordering consistent
# (factor levels are set to group_1 first, then group_2)
mutate(Group = factor(Group, levels = c(group_1, group_2)))
pyramid_plot <- ggplot(pyramid, aes(x = Group,
y = Value,
fill = Group)
) +
# First layer: only the rows of group_1
geom_bar(data = subset(pyramid, Group == group_1),
stat = "identity"
) +
# Second layer: only the rows of group_2
geom_bar(data = subset(pyramid, Group == group_2),
stat = "identity"
) +
# Flip the axes so the bars run horizontally
coord_flip() +
# Two manual fill colours, with breaks and labels given in the order
# group_1, group_2
scale_fill_manual(name = "Group",
values = c("#1f78b4", "#33a02c"), # blue, green
breaks = c(group_1, group_2),
labels = c(group_1, group_2)
)
pyramid_plot
# Now do another plot, keeping group_1 the same but changing group_2
# (same construction as the first plot; relies on mydata defined earlier)
group_1 <- "B"
group_2 <- "C"
# Keep only the two selected groups and flip the sign of group_2's values
pyramid <- mydata %>%
filter(Group == group_1 | Group == group_2) %>%
mutate(Value = ifelse(Group == group_2, Value * -1, Value)) %>%
# Thought this would keep color ordering consistent
# (factor levels are set to group_1 first, then group_2)
mutate(Group = factor(Group, levels = c(group_1, group_2)))
pyramid_plot2 <- ggplot(pyramid, aes(x = Group,
y = Value,
fill = Group)
) +
# First layer: only the rows of group_1
geom_bar(data = subset(pyramid, Group == group_1),
stat = "identity"
) +
# Second layer: only the rows of group_2
geom_bar(data = subset(pyramid, Group == group_2),
stat = "identity"
) +
# Flip the axes so the bars run horizontally
coord_flip() +
# Same two manual fill colours as in the first plot
scale_fill_manual(name = "Group",
values = c("#1f78b4", "#33a02c"), # blue, green
breaks = c(group_1, group_2),
labels = c(group_1, group_2)
)
pyramid_plot2
Group <- c("A", "B", "C")
Value <- c(3, 3, 5)
mydata <- data.frame(Group, Value)

group_1 <- "B"
group_2 <- "A"

# Keep the two selected groups, flip the sign of group_2 so its bar points the
# other way, and put group_1 first in the factor levels
pyramid <- mydata %>%
  filter(Group %in% c(group_1, group_2)) %>%
  mutate(
    Value = ifelse(Group == group_2, -Value, Value),
    Group = factor(Group, levels = c(group_1, group_2))
  )

# A single bar layer on the full data; fill is mapped to Group
ggplot(pyramid, aes(x = Group, y = Value, fill = Group)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(name = "Group", values = c("#1f78b4", "#33a02c"))
Output:
group_1 <- "B"
group_2 <- "C"

# Same preparation as before, now with "C" as the second group
pyramid <- mydata %>%
  filter(Group %in% c(group_1, group_2)) %>%
  mutate(
    Value = ifelse(Group == group_2, -Value, Value),
    Group = factor(Group, levels = c(group_1, group_2))
  )

# A single bar layer on the full data; fill is mapped to Group
ggplot(pyramid, aes(x = Group, y = Value, fill = Group)) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(name = "Group", values = c("#1f78b4", "#33a02c"))
Second output:
B is always blue.
I think the issue was with calling the geom_bar twice and subsetting the data. Fill does that for you already, so no need to further slice the data.

Sorting factors in multipanel plot in ggplot2 according to the first panel

Is it possible to sort factors in a multipanel plot in ggplot2 according to the first panel? The first panel decides the order and the remaining panels follow that order.
Here is an example:
# library() stops with an error if ggplot2 is missing; require() would only
# return FALSE and let the script fail later.
library(ggplot2)

set.seed(36)
# 2 years x 5 clones x 4 treatments, with a random VALUE between 1 and 10
xx <- data.frame(
  YEAR = rep(c("X", "Y"), each = 20),
  CLONE = rep(c("A", "B", "C", "D", "E"), each = 4, times = 2),
  TREAT = rep(c("T1", "T2", "T3", "C"), 10),
  VALUE = sample(c(1:10), 40, replace = TRUE)
)

# One panel per treatment, bars dodged by year
ggplot(xx, aes(x = CLONE, y = VALUE, fill = YEAR)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~TREAT)
Which gives me this plot:
Now I would like to sort CLONE based on the VALUE in YEAR X in a descending order (highest to lowest) but only for the Control (C panel). This order should then be maintained for T1, T2, and T3. By looking at the plot above, I want panel C sorted as CLONE C, B or D (both are 5), A and E. This order of CLONE should then be replicated for the remaining panels.
There's no easy way to do this right in ggplot since you have to reorder CLONE by
3 conditions, TREAT, YEAR and VALUE, otherwise forcats::fct_reorder2 could have been an option.
Instead, extract the order of CLONE from the subset of data corresponding to YEAR = "X",
TREAT = "C", and re-define your factor levels for the whole data set based on this subset.
library("ggplot2")
library("dplyr")

set.seed(36)
xx <- data.frame(
  YEAR = rep(c("X", "Y"), each = 20),
  CLONE = rep(c("A", "B", "C", "D", "E"), each = 4, times = 2),
  TREAT = rep(c("T1", "T2", "T3", "C"), times = 10),
  VALUE = sample(1:10, size = 40, replace = TRUE),
  stringsAsFactors = FALSE
)

# CLONE order in the reference panel (TREAT "C", YEAR "X"), highest VALUE first
clone_order <- xx %>%
  filter(TREAT == "C" & YEAR == "X") %>%
  arrange(desc(VALUE)) %>%
  select(CLONE) %>%
  unlist()

# Apply that order to CLONE in the whole data set, so every panel follows it
xx <- xx %>%
  mutate(CLONE = factor(CLONE, levels = clone_order))

ggplot(xx, aes(x = CLONE, y = VALUE, fill = YEAR)) +
  geom_col(position = "dodge") +
  facet_wrap(~TREAT)
giving

How to create a bar plot and show average Y values

I want to create a bar plot based on the following data:
Station Delay
A 5
B 6
A 4
A 3
B 8
The X axis should contain stations "A" and "B", while the bars (Y axis) should show the average delay per station.
I tried this, but it does not give a correct result:
# Attempt from the question (reported as not giving the correct result).
# c() joins the Station and Delay columns into one combined vector, so
# barplot() is not given per-station averages.
barplot(c(data$Station, data$Delay),
main="BARPLOT", xlab="Stations", ylab="Delays",
names.arg=data$Station)
df <- data.frame(Station = c("A", "B", "A", "A", "B"), Delay = c(5, 6, 4, 3, 8))
library(dplyr)
# Store the per-station means under a new name so that the raw `df`
# (with its Delay column) stays available for later use.
df_mean <- df %>%
  group_by(Station) %>%
  summarise(me = mean(Delay))
library(ggplot2)
# One bar per station, with the mean delay as its height
ggplot(df_mean, aes(x = Station, y = me)) +
  geom_bar(stat = "identity")
or directly with stat_summary
# Needs the raw data (one row per observation, with a Delay column).
# `fun` replaces the `fun.y` argument, which is deprecated since ggplot2 3.3.0.
ggplot(df, aes(x = Station, y = Delay)) +
  stat_summary(fun = "mean", geom = "bar")
In base R, you can do:
# ave() returns, for every row, the mean Delay of that row's Station.
# The Station column is named explicitly: data.frame(data$Station, ...) would
# call it `data.Station`, and `$Station` below would then return NULL, leaving
# the bars unlabelled.
m_data <- data.frame(
  Station = data$Station,
  m_del = ave(data$Delay, data$Station),
  stringsAsFactors = FALSE
)
# Collapse to one row per station before plotting
m_unique <- unique(m_data)
barplot(
  m_unique$m_del,
  names.arg = m_unique$Station,
  main = "BARPLOT", xlab = "Stations", ylab = "Delays"
)
Or with the package data.table, you can do:
library(data.table)

# setDT() converts `data` to a data.table by reference; the mean delay per
# station is stored in column V1
m_data <- setDT(data)[, .(V1 = mean(Delay)), by = Station]

# Draw the bars from inside the data.table so V1 and Station resolve as columns
m_data[, barplot(
  V1,
  names.arg = Station,
  main = "BARPLOT", xlab = "Stations", ylab = "Delays"
)]

Resources