Plot factors in order with grouping variable - r

I'm working with R. I have a dataframe that looks like this:
df <- (structure(list(year = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 5L), .Label = c("2013", "2014", "2015", "2016", "2017"),
class = "factor"), user = structure(c(2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L,
3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L, 5L),
.Label = c("John", "Laura", "Liz", "Mark", "Martha"), class = "factor"),
spent = c(56, 64, 69, 38, 93, 70, 29, 94, 56, 76, 48, 17,
74, 67, 100, 29, 16, 23, 10, 51, 72, 35, 77, 83, 17)),
class = "data.frame", row.names = c(NA, -25L)))
I'm trying to generate a histogram with the "spent" variable on the y-axis, the "user" on the x-axis, and a facet for each year. For each year, the users should be ordered based on the "spent" variable.
I tried something like df$user2=factor(df$user, levels = df$user[order(df$year,df$spent)])
But I get an error saying that the 6th factor is duplicated.
Any help is greatly appreciated!
Gerry

What you are describing is a bar plot. A histogram shows the distribution of a single continuous variable (for example hist(rnorm(100)).
Your ordering statement gave an error because each level in a factor variable (each unique value of user in this case) can appear only once in the levels argument. factor allows you to set a new ordering of the unique levels of user. For example, instead of alphabetic ordering, we can do levels=c("Liz","Laura","Mark","John","Martha")). Then df[order(df$user),] will sort the data frame by the new order of user and df[order(df$year, df$user),] will sort by year than user. However, we can't use factor to get a different order of user for each year.
Based on your description, it looks like you want a faceted plot, but with a different x-axis order in each facet. You can do this in ggplot if you create a new variable that sets the x-axis order (I've called this variable r below) and then use the labels argument in scale_x_continuous to get the desired axis labels.
library(tidyverse)
df = df %>%
# Convert year back to numeric
mutate(year = as.numeric(as.character(year))) %>%
# Sort data into the order we want
arrange(year, spent) %>%
# Create a new variable with the desired row order
mutate(r = row_number())
ggplot(df, aes(r, spent)) +
geom_col() +
facet_grid(. ~ year, scale="free_x") +
scale_x_continuous(breaks=df$r, labels=df$user)
The above plot seems confusing due to the user order changing in each facet. Maybe something like this would work better:
ggplot(df, aes(year, spent, colour=user, group=user)) +
geom_line() +
geom_point() +
geom_text(data=df %>% filter(year==min(year)), aes(label=user),
hjust=1, position=position_nudge(x=-0.1), size=3) +
expand_limits(y=0, x=2012.5) +
theme_classic() +
guides(colour=FALSE)

Related

How to prevent R from alphabetically ranking data in ggplot and specify the order in which data is plotted (Data + Code + Graphs provided)?

I'm trying to fix an issue with my GGBalloonPlot graph with regards to how R processes the axis labels.
By default R plots the data using the labels ranked in reverse alphabetical order but to reveal the pattern of the data, the data need to be plotted in a specific order. The only way I've been able to do trick the software is by manually adding a prefix to each label in my .csv table so that R would rank them properly in my output. This is time consuming since I need to manually order the data first before adding the prefix and then plotting.
I would like to input a character vector (or something like that) which would essentially specify the order in which I want to have the data plotted which would reveal the pattern without the need for a prefix in the label name.
I have made some attempts with "scale_y_discrete" without success. I would also like to do the same thing for the X axis since I've had to use the same "trick" to display the columns in the proper non-alphabetical order which offsets the position of the labels. Any idea on how to get GGplot to display my values as seen in the graph without having to "trick" the software since this is quite time consuming ?
Data + Code
#Assign data to "Stack_Overflow_DummyData"
Stack_Overflow_DummyData <- structure(list(Species = structure(c(8L, 3L, 1L, 5L, 6L, 2L,
7L, 4L, 8L, 3L, 1L, 5L, 6L, 2L, 7L, 4L, 8L, 3L, 1L, 5L, 6L, 2L,
7L, 4L, 8L, 3L, 1L, 5L, 6L, 2L, 7L, 4L), .Label = c("Ani", "Cal",
"Can", "Cau", "Fis", "Ort", "Sem", "Zan"), class = "factor"),
Species_prefix = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L,
1L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 7L, 6L, 5L, 4L, 3L,
2L, 1L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("ac.Cau",
"ad.Sem", "af.Cal", "ag.Ort", "as.Fis", "at.Ani", "be.Can",
"bf.Zan"), class = "factor"), Dist = structure(c(2L, 3L,
5L, 2L, 1L, 1L, 4L, 5L, 2L, 3L, 5L, 2L, 1L, 1L, 4L, 5L, 2L,
3L, 5L, 2L, 1L, 1L, 4L, 5L, 2L, 3L, 5L, 2L, 1L, 1L, 4L, 5L
), .Label = c("End", "Ind", "Pan", "Per", "Wid"), class = "factor"),
Region = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Cen", "Col",
"Far", "Nor"), class = "factor"), Region_prefix = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("a.Far", "b.Nor", "c.Cen", "d.Col"), class = "factor"),
Frequency = c(75, 50, 25, 50, 0, 0, 0, 0, 11.1, 22.2, 55.6,
55.6, 11.1, 0, 5.6, 0, 0, 2.7, 36.9, 27.9, 65.8, 54.1, 37.8,
28.8, 0, 0, 0, 3.1, 34.4, 21.9, 78.1, 81.3)), class = "data.frame", row.names = c(NA,
-32L))
# Plot Data With Prefix Trick
library(ggplot2)
library(ggpubr)
# make color base on Dist, size and alpha dependent on Frequency
ggballoonplot(Stack_Overflow_DummyData, x = "Region_prefix", y = "Species_prefix",
size = "Frequency", size.range = c(1, 9), fill = "Dist") +
theme_set(theme_gray() +
theme(legend.key=element_blank())) +
# Sets Grey Theme and removes grey background from legend panel
theme(axis.title = element_blank()) +
# Removes X axis title (Region)
geom_text(aes(label=Frequency), alpha=1.0, size=3, nudge_x = 0.4)
# Add Frequency Values Next to the circles
# Plot Data Without Prefix Trick
library(ggplot2)
library(ggpubr)
# make color base on Dist, size and alpha dependent on Frequency
ggballoonplot(Stack_Overflow_DummyData, x = "Region", y = "Species",
size = "Frequency", size.range = c(1, 9), fill = "Dist") +
theme_set(theme_gray() +
theme(legend.key=element_blank())) +
# Sets Grey Theme and removes grey background from legend panel
theme(axis.title = element_blank()) +
# Removes X axis title (Region)
geom_text(aes(label=Frequency), alpha=1.0, size=3, nudge_x = 0.4)
# Add Frequency Values Next to the circles
Here below are the graphs
Good Graph.
Using the label prefix trick with the visible pattern in the data:
Wrong Graph (R default).
Without the prefix trick when GGplot automatically orders the data/labels and the graph makes no sense:
To sum up, I would like the Good graph output without having to have to previously add a prefix in my labels.
Many Thanks in advance for your help.
For the axis labels I would define a previous function to override the breaks:
shlab <- function(lbl_brk){
sub("^[a-z]+\\.","",lbl_brk) # removes the starts of strings as a. or ab.
}
Then, to change the labels you just have to use scale_x,y_discrete with labels = shlab (if you look at the help of scale_x_discrete you will see that one of the options for labels is A function that takes the breaks as input and returns labels as output).
For the colours would be enough to change them (values) in scale_fill_manual and for the sizes, using guides so:
library(ggplot2)
library(ggpubr)
shlab <- function(lbl_brk){
sub("^[a-z]+\\.","",lbl_brk)
}
ggballoonplot(Stack_Overflow_DummyData, x = "Region_prefix", y = "Species_prefix", size = "Frequency", size.range = c(1, 9), fill = "Dist") +
scale_x_discrete(labels = shlab) +
scale_y_discrete(labels = shlab) +
scale_fill_manual(values = c("green", "blue", "red", "black", "white")) +
guides(fill = guide_legend(override.aes = list(size=8))) +
theme_set(theme_gray() + theme(legend.key=element_blank())) + # Sets Grey Theme and removes grey background from legend panel
theme(axis.title = element_blank()) + # Removes X axis title (Region)
geom_text(aes(label=Frequency), alpha=1.0, size=3, nudge_x = 0.4) # Add Frequency Values Next to the circles
UPDATE:
With the new dataset and vector labels:
library(ggplot2)
library(ggpubr)
# make color base on Dist, size and alpha dependent on Frequency
ggballoonplot(Stack_Overflow_DummyData, x = "Region", y = "Species",
size = "Frequency", size.range = c(1, 9), fill = "Dist") +
scale_y_discrete(limits = c("Cau", "Sem", "Cal", "Ort", "Fis", "Ani", "Can", "Zan")) +
scale_x_discrete(limits = c("Far", "Nor", "Cen", "Col")) +
theme_set(theme_gray() +
theme(legend.key=element_blank())) +
# Sets Grey Theme and removes grey background from legend panel
theme(axis.title = element_blank()) +
# Removes X axis title (Region)
geom_text(aes(label=Frequency), alpha=1.0, size=3, nudge_x = 0.4)

Order of stacked bars ggplot2 - Soil profile

The documentation for bar charts in ggplot2 says (see example 3):
Bar charts are automatically stacked when multiple bars are placed at the same location. The order of the fill is designed to match the legend.
For some reason the second sentence doesn't work for me. Here is an example data set, which represents soil layers above (leaf litter etc.) and below ground (actual soil):
df <- structure(list(horizon = structure(c(5L, 3L, 4L, 2L, 1L, 5L,
3L, 4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L, 5L, 3L,
4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L), .Label = c("A", "B", "F", "H",
"L"), class = "factor"), site = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), value = c(2.75, 0.5, 0.25,
-4.125, -3.375, 3.78125, 1.375, 0.625, -10.6875, -6.34375, 4.28,
2.065, 0.68, -12.1, -10.75, 8.583333333, 4.541666667, 2.166666667,
-10.70833333, -4.25, 7.35, 4, 1.8, -13.95, -5.175, 1.933333333,
1.245833333, 0.641666667, -11.16666667, -2.291666667)), .Names = c("horizon",
"site", "value"), class = "data.frame", row.names = c(NA, -30L
))
Now I try to plot the data by first specifying the order of the soil layer levels (i.e. horizons, from above to below ground):
require(ggplot2); require(dplyr)
df %>%
mutate(horizon = factor(horizon, levels = c("L","F","H","A","B"))) %>%
ggplot(aes(site, value)) + geom_col(aes(fill = horizon)) + labs(y = "Soil depth (cm)")
It works for L, F, H but not for A, B (below ground, i.e. negative values). The reason why it probably doesn't work is that the stacked bars are sorted from largest to smallest by site (for both positive and negative values separately) and then stacked in a top to bottom approach. Is this correct? If that's the case, then for my positive values it was just coincidence that the legend matched the stacked bars I believe.
What I would like to achieve is a stacking of the bars that matches the order (top to bottom) in the legend and hence also the soil profile when looking at it in a cross-sectional view and I am not sure how to approach this.
I did try to change the sorting behaviour in general but it produced the same plot as above:
df %>%
mutate(horizon = factor(horizon, levels = c("L","F","H","A","B"))) %>%
arrange(desc(value)) %>%
ggplot(aes(site, value)) + geom_col(aes(fill=horizon)) + labs(y = "Soil depth (cm)")
df %>%
mutate(horizon = factor(horizon, levels = c("L","F","H","A","B"))) %>%
arrange(value) %>%
ggplot(aes(site, value)) + geom_col(aes(fill=horizon)) + labs(y = "Soil depth (cm)")
I probably have to sort positive and negative values separately, that is descending and ascending, respectively?
Sorting in a stacked bar plot is done according to levels of the corresponding factor. The potential problem arises with negative values which are stacked in reverse (from the negative top towards 0). To illustrate to problem lets make all the values negative:
df %>%
mutate(horizon = factor(horizon, levels = c("L","F","H","B","A"))) %>%
ggplot(aes(site, value - 20)) + geom_col(aes(fill = horizon)) + labs(y = "Soil depth (cm)")
A workaround is to specify a different order of levels which will result in the wanted fill order (in this case: levels = c("L","F","H","B","A")) and manually adjust the legend using scale_fill_discrete:
df %>%
mutate(horizon = factor(horizon, levels = c("L","F","H","B","A"))) %>%
ggplot(aes(site, value)) + geom_col(aes(fill = horizon)) + labs(y = "Soil depth (cm)")+
scale_fill_discrete(breaks = c("L","F","H","A","B"))

Reordering factor for plotting using forcats and ggplot2 packages from tidyverse

First of all, thanks^13 to tidyverse. I want the bars in the chart below to follow the same factor levels reordered by forcats::fct_reorder (). Surprisingly, I see different order of levels in the data set when View ()ed as when they are displayed in the chart (see below). The chart should illustrate the number of failed students before and after the bonus marks (I want to sort the bars based on the number of failed students before the bonus).
MWE
ggplot (df) +
geom_bar (aes (forcats::fct_reorder (subject, FailNo, .desc= TRUE), FailNo, fill = forcats::fct_rev (Bonus)), position = 'dodge', stat = 'identity') +
theme (axis.text.x=element_text(angle=45, vjust=1.5, hjust=1.5, size = rel (1.2)))
Data output of dput (df)
structure(list(subject = structure(c(1L, 2L, 5L, 6L, 3L, 7L,
4L, 9L, 10L, 8L, 12L, 11L, 1L, 2L, 5L, 6L, 3L, 7L, 4L, 9L, 10L,
8L, 12L, 11L), .Label = c("CAB_1", "DEM_1", "SSR_2", "RRG_1",
"TTP_1", "TTP_2", "IMM_1", "RRG_2", "DEM_2", "VRR_2", "PRS_2",
"COM_2", "MEB_2", "PHH_1", "PHH_2"), class = "factor"), Bonus = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("After", "Before"), class = "factor"),
FailNo = c(29, 28, 20, 18, 15, 13, 12, 8, 5, 4, 4, 2, 21,
16, 16, 14, 7, 10, 10, 5, 3, 4, 4, 1)), .Names = c("subject",
"Bonus", "FailNo"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-24L))
Bar chart
The issue
According to the table above, SSR_2 var should come in the fifth rank and IMM_1 in the sixth, however in the chart we see these two variables swapping their positions. How to sort it right after tidyverse in this case?
Use factor with unique levels for your x -axis.
ggplot (df) +
geom_bar (aes(factor(forcats::fct_reorder
(subject, FailNo, .desc= TRUE),
levels=unique(subject)),
FailNo,
fill = forcats::fct_rev (Bonus)),
position = 'dodge', stat = 'identity') +
theme(axis.text.x=element_text(angle=45, vjust=1.5, hjust=1.5, size = rel (1.2)))
Edited: #dotorate comment
Sort failNo before the bonus
library(dplyr)
df_before_bonus <- df %>% filter(Bonus == "Before") %>% arrange(desc(FailNo))
Use FailNo before the bonus to create the factor
df$subject <- factor(df$subject, levels = df_before_bonus$subject, ordered = TRUE)
Updated plot
ggplot(df) +
geom_bar(aes (x = subject, y = FailNo, fill = as.factor(Bonus)),
position = 'dodge', stat = 'identity') +
theme (axis.text.x=element_text(angle=45, vjust=1.5, hjust=1.5, size = rel (1.2)))

Center Labels in Filled Bar Chart using geom_text

I am new to ggplot2 (and R) and am trying to make a filled bar chart with labels in each box indicating the percentage composing that block.
Here is an example of my current figure to which I would like to add labels:
##ggplot figure
library(gpplot2)
library(scales)
#specify order I want in plots
ZIU$Affinity=factor(ZIU$Affinity, levels=c("High", "Het", "Low"))
ZIU$Group=factor(ZIU$Group, levels=c("ZUM", "ZUF", "ZIM", "ZIF"))
ggplot(ZIU, aes(x=Group))+
geom_bar(aes(fill=Affinity), position="fill", width=1, color="black")+
scale_y_continuous(labels=percent_format())+
scale_fill_manual("Affinity", values=c("High"="blue", "Het"="lightblue", "Low"="gray"))+
labs(x="Group", y="Percent Genotype within Group")+
ggtitle("Genotype Distribution", "by Group")
I would like to add labels centered in each box with the percentage that box represents
I have tried to add labels using this code, but it keeps producing the error message "Error: geom_text requires the following missing aesthetics: y" but my plot has no y aesthetic, does this mean I cannot use geom_text? (Also, I am not sure if once the y aesthetic issue is resolved, if the remainder of the geom_text statement will accomplish what I desire, centered white labels in each box.)
ggplot(ZIU, aes(x=Group)) +
geom_bar(aes(fill=Affinity), position="fill", width=1, color="black")+
geom_text(aes(label=paste0(sprintf("%.0f", ZIU$Affinity),"%")),
position=position_fill(vjust=0.5), color="white")+
scale_y_continuous(labels=percent_format())+
scale_fill_manual("Affinity", values=c("High"="blue", "Het"="lightblue", "Low"="gray"))+
labs(x="Group", y="Percent Genotype within Group")+
ggtitle("Genotype Distribution", "by Group")
Also if anyone has suggestions for eliminating the NA values that would be appreciated! I tried
geom_bar(aes(fill=na.omit(Affinity)), position="fill", width=1, color="black")
but was getting the error "Error: Aesthetics must be either length 1 or the same as the data (403): fill, x"
dput(sample)
structure(list(Group = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("ZUM", "ZUF", "ZIM", "ZIF"), class = "factor"),
StudyCode = c(1, 2, 3, 4, 5, 6, 20, 21, 22, 23, 143, 144,
145, 191, 192, 193, 194, 195, 196, 197, 10, 24, 25, 26, 27,
28, 71, 72, 73, 74, 274, 275, 276, 277, 278, 279, 280, 290,
291, 292), Affinity = structure(c(3L, 2L, 1L, 2L, 3L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 1L, 1L, 1L, 3L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
3L, 2L, 2L, 2L), .Label = c("High", "Het", "Low"), class = "factor")), .Names = c("Group",
"StudyCode", "Affinity"), row.names = c(NA, 40L), class = c("tbl_df",
"tbl", "data.frame"))
Thank you so much!
The linked examples have a y aesthetic, because the data are pre-summarized, rather than having ggplot do the counting internally. With your data, the analogous approach would be:
library(scales)
library(tidyverse)
# Summarize data to get counts and percentages
ZIU %>% group_by(Group, Affinity) %>%
tally %>%
mutate(percent=n/sum(n)) %>% # Pipe summarized data into ggplot
ggplot(aes(x=Group, y=percent, fill=Affinity)) +
geom_bar(stat="identity", width=1, color="black") +
geom_text(aes(label=paste0(sprintf("%1.1f", percent*100),"%")),
position=position_stack(vjust=0.5), colour="white") +
scale_y_continuous(labels=percent_format()) +
scale_fill_manual("Affinity", values=c("High"="blue", "Het"="lightblue", "Low"="gray")) +
labs(x="Group", y="Percent Genotype within Group") +
ggtitle("Genotype Distribution", "by Group")
Another option would be to use a line plot, which might make the relative values more clear. Assuming the Group values don't form a natural sequence, the lines are just there as a guide for differentiating the Affinity values across different values of Group.
ZIU %>% group_by(Group, Affinity) %>%
tally %>%
mutate(percent=n/sum(n)) %>% # Pipe summarized data into ggplot
ggplot(aes(x=Group, y=percent, colour=Affinity, group=Affinity)) +
geom_line(alpha=0.4) +
geom_text(aes(label=paste0(sprintf("%1.1f", percent*100),"%")), show.legend=FALSE) +
scale_y_continuous(labels=percent_format(), limits=c(0,1)) +
labs(x="Group", y="Percent Genotype within Group") +
ggtitle("Genotype Distribution", "by Group") +
guides(colour=guide_legend(override.aes=list(alpha=1, size=1))) +
theme_classic()

Show only one text value in ggplot2

I'm attempting to limit the text printing to one variable in a bar plot. How can I just label the pink bar 601, 215, 399, 456?
ggplot(df, aes(Var1, value, label=value, fill=Var2)) +
geom_bar(stat="identity", position=position_dodge(width=0.9)) +
geom_text(position=position_dodge(width=0.9))
structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L,
4L, 1L, 2L, 3L, 4L), .Label = c("Zero", "1-30", "31-100", "101+"
), class = "factor"), Var2 = structure(c(1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("Searches", "Contact",
"Accepts"), class = "factor"), value = c(21567, 215, 399, 456,
13638, 99, 205, 171, 5806, 41, 88, 78)), .Names = c("Var1", "Var2",
"value"), row.names = c(NA, -12L), class = "data.frame")
You can do this with an ifelse statement in geom_text. First, remove label=value from the main ggplot2 call. Then, in geom_text add an ifelse condition on the label as shown below. Also, if you're dodging more than one aesthetic, you can save some typing by creating a dodging object.
pd = position_dodge(0.9)
ggplot(df, aes(Var1, value, fill=Var2)) +
geom_bar(stat="identity", position=pd) +
geom_text(position=pd, aes(label=ifelse(Var2=="Searches", value,"")))
If you want the text in the middle of the bar, rather than at the top, you can do:
geom_text(position=pd, aes(label=ifelse(Var2=="Searches", value, ""), y=0.5*value))
You can actually keep the label statement (with the ifelse condition added) in the main ggplot call, but since label only applies to geom_text (or geom_label), I usually keep it with the geom rather than the main call.

Resources