Plotting small multiples using for loop and ggplot - r

I have a dataset as follows:
Unit Group Feature1 Feature2 Feature3 Feature4
1 1 blue x a 12
2 1 yellow y b 15
3 2 green x a 13
4 3 indigo z c 12
5 1 green y b 16
I'd like to create a grid of visualizations (small multiples) where each row is a group, and each column contains proportions of each feature (ie. the table function, table(dataset$feature1)). I have done the following, however, am having a hard time creating a grid of these visualizations while using a for loop. Currently, I get four different images each with four charts. Any ideas on how to turn this into essentially a 4x4 grid of barcharts rather than 4 separate images?
library(gridExtra)
input_max_groups <- 4
for (i in 1:input_max_groups) {
dataset_subset <- subset(dataset, group== i)
feature1_df <- as.data.frame(table(dataset_subset$feature1)/nrow(dataset_subset)*100)
feature1_plot <- feature1_df %>%
ggplot(aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
xlab("") +
theme_bw()
feature2_df <- as.data.frame(table(dataset_subset$feature2)/nrow(dataset_subset)*100)
feature2_plot <- feature2_df %>%
ggplot(aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
xlab("") +
theme_bw()
feature3_df <- as.data.frame(table(dataset_subset$feature3)/nrow(dataset_subset)*100)
feature3_plot <- feature3_df %>%
ggplot(aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
xlab("") +
theme_bw()
feature4_df <- as.data.frame(table(dataset_subset$feature4)/nrow(dataset_subset)*100)
feature4_plot <- feature4_df %>%
ggplot(aes(x=Var1, y=Freq)) +
geom_bar(stat="identity", fill="#f68060", alpha=.6, width=.4) +
xlab("") +
theme_bw()
plot <- grid.arrange(feature1_plot, feature2_plot, feature3_plot, feature4_plot, nrow=4)
}

Do you mean something like this?
You can accomplish small multiples by using facet_wrap() and you can plot percentages instead of counts by using stat = 'count' in geom_bar() and mapping the y-aesthetic to the special variable ..prop.. for proportion. In your case, you'll need to specify the group to get the proper proportion, and use scales = 'free_x' in the facet wrap to get the x-axis for each facet to contain only the variables of interest.
Now your data is wide and ggplot likes long data so you need to pivot the feature columns into rows to make your data longer using pivot_longer
This way you can leave out your loops and grid.arrange and do it in a single ggplot call.
library(tidyverse)
dataset_subset <- tribble(
~"Unit", ~"Group", ~"Feature1", ~"Feature2", ~"Feature3", ~"Feature4",
1, 1, "blue" , "x", "a", "12",
2, 1, "yellow", "y", "b", "15",
3, 2, "green" , "x", "a", "13",
4, 3, "indigo", "z", "c", "12",
5, 1, "green" , "y", "b", "16")
dataset_subset %>%
pivot_longer(contains("Feature")) %>%
ggplot(aes(x = value)) +
geom_bar(aes(y = ..prop.., group = name), stat = "count", fill = "#f68060", alpha =.6, width = .4) +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~name, scales = "free_x")
Created on 2020-05-23 by the reprex package (v0.3.0)

Related

Using specific x axis order and plotting together 4 plots in R

I have been using ggplot geom_boxplot and arranging the x axis according to a specific order with scale_x_discrete resulting in a perfect plot. Then I try to arrange 4 of these plots using ggarrange, but that overrides the order of the groups.
How can I do both? thank you
H_p<-ggplot (DiversData, aes(x=site, y=H, color=site)) +
geom_boxplot()
H_p + scale_x_discrete(limits=c("Achziv", "SdotYam", "Sharon", "Ashdod", "Ashkelon")) +labs(title="Shannon fish diversity", x = "", y = "H", color = "Site") + theme_classic() + theme(legend.position="none")
One of the 4 original plots
Then when I tile them:
library(ggpubr)
require(grid)
ggarrange(p_vis2 + rremove("x.text")+ rremove("xlab"), H_p + rremove("x.text")+ rremove("xlab"), S_p + rremove("x.text")+ rremove("xlab"), J_p + rremove("x.text")+ rremove("xlab"),
labels = c("A", "B", "C", "D"),
ncol = 2, nrow = 2,common.legend = TRUE, legend = "bottom",
align = "hv",
font.label = list(c("Achziv", "SdotYam", "Sharon", "Ashdod", "Ashkelon"),size = 10, color = "black", face = "bold", family = NULL, position = "top") )
The x axis goes back to ABC order
Thank you so much!
First, turn your 'site' column into a character vector (having not seen your existing dataset, I don't know if it's a character or factor, so this step may be redundant).
DiversData$site<- as.character(DiversData$site)
Secondly, turn it back to a factor with the levels in the desired order
DiversData$site<- factor(DiversData$site, levels=c("Achziv", "SdotYam", "Sharon", "Ashdod", "Ashkelon"))
then plot as before:
H_p<-ggplot (DiversData, aes(x=site, y=H, color=site)) +
geom_boxplot()
etc

Keeping unit of measure in facet_wrap while scales="free_y"? [duplicate]

This question already has an answer here:
Setting individual y axis limits with facet wrap NOT with scales free_y
(1 answer)
Closed 4 years ago.
I'm trying to create a facet_wrap() where the unit of measure remains identical across the different plots, while allowing to slide across the y axis.
To clearify with I mean, I have created a dataset df:
library(tidyverse)
df <- tibble(
Year = c(2010,2011,2012,2010,2011,2012),
Category=c("A","A","A","B","B","B"),
Value=c(1.50, 1.70, 1.60, 4.50, 4.60, 4.55)
)
with df, we can create the following plot using facet_wrap:
ggplot(data = df, aes(x=Year, y=Value)) + geom_line() + facet_wrap(.~ Category)
Plot 1
To clarify the differences between both plots, one can use scale = "free_y":
ggplot(data = df, aes(x=Year, y=Value)) + geom_line()
+ facet_wrap(.~ Category, scale="free_y")
Plot 2
Although it's more clear, the scale on the y-axis in plot A isequal to 0.025, while being 0.0125 in B. This could be misleading to someone who's comparing A & B next to each other.
So my question right now is to know whether there exist an elegant way of plotting something like the graph below (with y-scale = 0.025) without having to plot two seperate plots into a grid?
Thanks
Desired result:
Code for the grid:
# Grid
## Plot A
df_A <- df %>%
filter(Category == "A")
plot_A <- ggplot(data = df_A, aes(x=Year, y=Value)) + geom_line() + coord_cartesian(ylim = c(1.5,1.7)) + ggtitle("A")
## Plot B
df_B <- df %>%
filter(Category == "B")
plot_B <- ggplot(data = df_B, aes(x=Year, y=Value)) + geom_line() + coord_cartesian(ylim = c(4.4,4.6)) + ggtitle("B")
grid.arrange(plot_A, plot_B, nrow=1)
Based on the info at Setting individual y axis limits with facet wrap NOT with scales free_y you can you use geom_blank() and manually specified y-limits by Category:
# df from above code
df2 <- tibble(
Category = c("A", "B"),
y_min = c(1.5, 4.4),
y_max = c(1.7, 4.6)
)
df <- full_join(df, df2, by = "Category")
ggplot(data = df, aes(x=Year, y=Value)) + geom_line() +
facet_wrap(.~ Category, scales = "free_y") +
geom_blank(aes(y = y_min)) +
geom_blank(aes(y = y_max))

ggplot Find sum of all groups and plot as line

I have a data that looks like this
Group x y
A 2 30
B 2 21
C 2 22
A 3 15
B 3 18
C 3 5
A 4 14
B 4 29
C 4 46
And create a chart with:
gg <- ggplot(mydata,
aes(x=x, y=y, fill=Group, group=Group))+
geom_line(data =mydata,
aes(x=x, y=y,colour=Group),
stat="identity",
size=1.5)
plot(gg)
I'm trying to add a fourth line that has the sum of A+B+C at every X. I've tried this but it adds 5 lines, not one with a sum. I want a line that would be y=73 when x=2, y=38 when x=3, and y=89 when x=4.
Code:
Group <- c("A", "B", "C","A", "B", "C","A", "B", "C")
x <- c(2,2,2,3,3,3,4,4,4)
y <- c(30,21,22,15,18,5,14,29,46)
mydata <- data.frame(Group,x,y)
gg <- ggplot(mydata,
aes(x=x, y=y, fill=Group, group=Group))+
geom_line(data =mydata,
aes(x=x, y=y,colour=Group),
stat="identity",
size=1.5)
plot(gg)
One way would be to generate a variable that sums all values of y by x via dplyr's group_by and mutate-functions. You can then generate your plot and add a second line geom that will show the x-specific sums.
library(tidyverse)
mydata %>%
group_by(x) %>%
mutate(sum.y = sum(y)) %>%
ggplot(aes(x=x, y=y, color=Group))+
geom_line(size=1.5) +
geom_line(aes(y = sum.y), color = "black")
Note that I changed your code by removing redundant code in the aesthetics, stat = "identity" in geom_line and all of the data = mydata specifications. These are simply not necessary.

Equal bar widths in ggplot2 histogram using facet_wrap()

I have data which looks similar to example data below and I am attempting to draw a histogram of the measurement column faceted on the Genotype column. Ultimately I would like the colours of the bars to be conditional on the Genotype and Condition columns.
Crucially Genotype B individuals were never measured under condition L.
This is what the data looks like:
library(ggplot2)
library(dplyr)
set.seed(123)
DF <- data.frame(Genotype = rep(c("A", "B"), 500),
Condition = sample(c("E", "L"), 1000, replace = T),
Measurment = round(rnorm(500,10,3), 0))
DF <- anti_join(DF, filter(DF, Genotype == "B" & Condition != "E"))'
head(DF)
Genotype Condition Measurment
1 A L 18
2 A L 2
3 B E 18
4 B E 18
5 B E 16
6 B E 16
Now I to specify the colours of the bars I thought it easiest to create a new column of hexcodes such that all individuals of Genotype B are one colour, and individuals of Genotype A are a second colour if measured under Condition E and a third colour if measured under Condition L.
DF <- DF %>% mutate(colr = ifelse(Genotype == "B", "#409ccd",
ifelse(Condition == "E", "#43cd80", "#ffc0cb")))
I can then draw a histogram faceted on the Genotype column like so:
ggplot(data=DF, aes(Measurment, fill = Condition)) +
geom_histogram(aes(y=..count.., fill = colr), position='dodge', binwidth = 1) +
facet_wrap(~Genotype, nrow=2) +
scale_fill_manual(values = c("#409ccd","#ffc0cb","#43cd80")) +
theme(legend.position="none")
and it like like this:
However as you can see the columns for Genotype B are twice the size of Genotype A. How can I shrink the Genotype B to the same size as Genotype A?
I considered adding dummy entries to my data where Genotype B has Condition L entries but the binning function then counts these as Measurements which is misleading. I also have a version of this using geom_bar() but that results in a similar problem. ggplot must have a way of doing this.
Any help appreciated.
something like this maybe?
ggplot(data=DF, aes(Measurment, fill = Condition)) +
geom_histogram(data=subset(DF, Genotype!="B"),aes(y=..count.., fill = colr), position='dodge', binwidth = 1) +
geom_histogram(data=subset(DF, Genotype=="B"),aes(x = Measurment, y=..count.., fill = colr), position=position_nudge(x=0.25), binwidth = 0.5) +
facet_wrap(~Genotype, nrow=2) +
scale_fill_identity() +
theme(legend.position="none")
Do you want something like the following? I assumed by size of the column you meant bar width.
library(grid)
library(gridExtra)
p1 <- ggplot(data=DF[DF$Genotype=='A',], aes(Measurment, fill = Condition)) +
geom_histogram(aes(y=..count.., fill = colr), position='dodge', binwidth = 1) +
scale_fill_manual(values = c("#43cd80","#ffc0cb")) +
theme(legend.position="none")
p2 <- ggplot(data=DF[DF$Genotype=='B',], aes(Measurment, fill = Condition)) +
geom_histogram(aes(y=..count.., fill = colr), binwidth = 0.5, boundary = 1) +
scale_fill_manual(values = c("#409ccd")) +
theme(legend.position="none")
grid.arrange(p1, p2)

R, ggplot: Change linetype within a series

I am using ggplot geom_smooth to plot turnover data of a customer group from previous year against the current year (based on calendar weeks). As the last week is not complete, I would like to use a dashed linetype for the last week. However, I can't figure out how to that. I can either change the linetype for the entire plot or an entire series, but not within a series (depending on the value of x):
To keep it simple, let's just use the following example:
set.seed(42)
frame <- data.frame(series = rep(c('a','b'),50),x = 1:100, y = runif(100))
ggplot(frame,aes(x = x,y = y, group = series, color=series)) +
geom_smooth(size=1.5, se=FALSE)
How would I have to change this to get dashed lines for x >= 75?
The goal would be something like this:
Thx very much for any help!
Edit, 2016-03-05
Of course I fail when trying to use this method on the original plot. The Problem lies with the ribbon, which is calculated using stat_summary and a predefined function. I tried to use use stat_summary on the original data (mdf), and geom_line on the smooth_data. Even when I comment out everything else, I still get "Error: Continuous value supplied to discrete scale". I believe the problem comes from the fact that the original x value (Kalenderwoche) was discrete, whereas the new, smoothed x is continuous. Do I have to somehow transform one into the other? What else could I do?
Here is what I tried (condensed to the essential lines):
quartiles <- function(x) {
x <- na.omit(x) # remove NULL
median <- median(x)
q1 <- quantile(x,0.25)
q3 <- quantile(x,0.75)
data.frame(y = median, ymin = median, ymax = q3)
}
g <- ggplot(mdf, aes(x=Kalenderwoche, y=value, group=variable, colour=variable,fill=variable))+
geom_smooth(size=1.5, method="auto", se=FALSE)
# Take out the data for smooth line
smooth_data <- ggplot_build(g)$data[[1]]
ggplot(mdf, aes(x=Kalenderwoche, y=value, group=variable, colour=variable,fill=variable))+
stat_summary(fun.data = quartiles,geom="ribbon", colour="NA", alpha=0.25)+
geom_line(data=smooth_data, aes(x=x, y=y, group=group, colour=group, fill=group))
mdf looks like this:
str(mdf)
'data.frame': 280086 obs. of 5 variables:
$ konto_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ Kalenderwoche: Factor w/ 14 levels "2015-48","2015-49",..: 4 12 1 3 7 13 10 6 5 9 ...
$ variable : Factor w/ 2 levels "Umsatz","Umsatz Vorjahr": 1 1 1 1 1 1 1 1 1 1 ...
$ value : num 0 428.3 97.8 76 793.1 ...
There are many accounts (konto_id), and for each account and calendar week (Kalenderwoche), there is a current turnover value (Umsatz) and a turnover value from last year (Umsatz Vorjahr). I can provide a smaller version of the data.frame and the entire code, if required.
Thx very much for any help!
P.S. I am a total novice in R, so my code probably looks rather stupid to pros, sorry for that :(
Edit, 2016-03-06
I have uploaded a subset of the data (mdf):
mdf
The full code of the original graph is the following (looking somewhat weird with so little data, but that's not the point ;)
library(dtw)
library(reshape2)
library(ggplot2)
library(RODBC)
library(Cairo)
# custom breaks for X axis
breaks.custom <- unique(mdf$Kalenderwoche)[c(TRUE,rep(FALSE,0))]
# function called by stat_summary
quartiles <- function(x) {
x <- na.omit(x)
median <- median(x)
q1 <- quantile(x,0.25)
q3 <- quantile(x,0.75)
data.frame(y = median, ymin = median, ymax = q3)
}
# Positions for guidelines and labels
horizontal.center <- (length(unique(mdf$Kalenderwoche))+1)/2
kw.horizontal.center <- as.vector(sort(unique(mdf$Kalenderwoche))[c(horizontal.center-0.5,horizontal.center+0.5)])
vpos.P75.label <- max(quantile(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[1]],0.75)
,quantile(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[2]],0.75))+10
# use the higher P75 value of the two weeks around the center
vpos.mean.label <- min(mean(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[1]])
,mean(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[2]]))-10
vpos.median.label <- min(median(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[1]])
,median(mdf$value[mdf$Kalenderwoche==kw.horizontal.center[2]]))-10
hpos.vline <- which(as.vector(sort(unique(mdf$Kalenderwoche))=="2016-03"))
# custom colour palette (2 colors)
cbPaletteLine <- c("#DA2626", "#2626DA")
cbPaletteFill <- c("#F0A8A8", "#7C7CE9")
# ggplot
ggplot(mdf, aes(x=Kalenderwoche, y=value, group=variable, colour=variable,fill=variable))+
geom_smooth(size=1.5, method="auto", se=FALSE)+
# SE=FALSE to suppress drawing of the SE of the fit.SE of the data shall be used instead:
stat_summary(fun.data = quartiles,geom="ribbon", colour="NA", alpha=0.25)+
scale_x_discrete(breaks=breaks.custom)+
scale_colour_manual(values=cbPaletteLine)+
scale_fill_manual(values=cbPaletteFill)+
#coord_cartesian(ylim = c(0, 250)) +
theme(legend.title = element_blank(), title = element_text(face="bold", size=12))+
#scale_color_brewer(palette="Dark2")+
labs(title = "Tranche 1", x = "Kalenderwoche", y = "Konto-Umsatz [CHF]")+
geom_vline(xintercept = hpos.vline, linetype=2)+
annotate("text", x=horizontal.center, y=vpos.median.label, label = "Median", size=4)+
annotate("text", x=horizontal.center, y=vpos.mean.label, label= "Mean", size=4)+
annotate("text", x=horizontal.center, y=vpos.P75.label, label = "P75%", size=4)+
theme(axis.text.x=element_text(angle = 90, hjust = 0.5, vjust = 0.5))
Edit, 2016-03-06
The final plot now looks like this (thx, Jason!!)
I am not so sure how to smooth all data and use different line types for subsets by geom_smooth function. My idea is to pull out the data which ggplot used to construct the plot and use geom_line to reproduce it. This was the way I did it:
set.seed(42)
frame <- data.frame(series=rep(c('a','b'), 50),
x = 1:100, y = runif(100))
library(ggplot2)
g <- ggplot(frame, aes(x=x, y=y, color=series)) + geom_smooth(se=FALSE)
# Take out the data for smooth line
smooth_data <- ggplot_build(g)$data[[1]]
ggplot(smooth_data[smooth_data$x <= 76, ], aes(x=x, y=y, color=as.factor(group), group=group)) +
geom_line(size=1.5) +
geom_line(data=smooth_data[smooth_data$x >= 74, ], linetype="dashed", size=1.5) +
scale_color_discrete("Series", breaks=c("1", "2"), labels=c("a", "b"))
You're right. The problem is that you add a continuous x to a discrete x in the original layer. One way to deal with it is to create a lookup table which in this case, it is easy because x is a sequence from 1 to 14. We can transform discrete x by indexing. In your code, it should work if you add:
level <- levels(mdf$Kalenderwoche)
ggplot(mdf, aes(x=Kalenderwoche, y=value, group=variable, colour=variable,fill=variable))+
stat_summary(fun.data = quartiles,geom="ribbon", colour="NA", alpha=0.25) +
geom_line(data=smooth_data, aes(x=level[x], y=y, group=group, colour=as.factor(group), fill=NA))
Here is my attempt for the question:
g <- ggplot(mdf, aes(x=Kalenderwoche, y=value, group=variable, colour=variable,fill=variable)) +
geom_smooth(size=1.5, method="auto", se=FALSE) +
# SE=FALSE to suppress drawing of the SE of the fit.SE of the data shall be used instead:
stat_summary(fun.data = quartiles,geom="ribbon", colour="NA", alpha=0.25)
smooth_data <- ggplot_build(g)$data[[1]]
ribbon_data <- ggplot_build(g)$data[[2]]
# Use them as lookup table
level <- levels(mdf$Kalenderwoche)
clevel <- levels(mdf$variable)
ggplot(smooth_data[smooth_data$x <= 13, ], aes(x=level[x], y=y, group=group, color=as.factor(clevel[group]))) +
geom_line(size=1.5) +
geom_line(data=smooth_data[smooth_data$x >= 13, ], linetype="dashed", size=1.5) +
geom_ribbon(data=ribbon_data,
aes(x=x, ymin=ymin, ymax=ymax, fill=as.factor(clevel[group]), color=NA), alpha=0.25) +
scale_x_discrete(breaks=breaks.custom) +
scale_colour_manual(values=cbPaletteLine) +
scale_fill_manual(values=cbPaletteFill) +
#coord_cartesian(ylim = c(0, 250)) +
theme(legend.title = element_blank(), title = element_text(face="bold", size=12))+
#scale_color_brewer(palette="Dark2")+
labs(title = "Tranche 1", x = "Kalenderwoche", y = "Konto-Umsatz [CHF]")+
geom_vline(xintercept = hpos.vline, linetype=2)+
annotate("text", x=horizontal.center, y=vpos.median.label, label = "Median", size=4)+
annotate("text", x=horizontal.center, y=vpos.mean.label, label= "Mean", size=4)+
annotate("text", x=horizontal.center, y=vpos.P75.label, label = "P75%", size=4)+
theme(axis.text.x=element_text(angle = 90, hjust = 0.5, vjust = 0.5))
Note that the legend has borderline.

Resources