ggpubr ggbarplot plotted with bizarre xvalues - r

I have the following code using the CSV below
library(ggpubr)
library(ggsci)
df = read.csv2("file.csv", row.names=1)
# Copy df
df2 = df
# Convert the cyl variable to a factor
df2$perc <- as.factor(df2$perc)
# Add the name colums
df2$name <- rownames(df)
ggbarplot(df2, x = "name", y = "perc",
fill = "role", # change fill color by cyl
color = "white", # Set bar border colors to white
palette = "npg", # jco journal color palett. see ?ggpar
sort.val = "asc", # Sort the value in dscending order
sort.by.groups = FALSE, # Don't sort inside each group
x.text.angle = 0, # Rotate vertically x axis texts
rotate = TRUE,
label = TRUE, label.pos = "out",
#label = TRUE, lab.pos = "in", lab.col = "white",
width = 0.5
)
the CSV is :
genes;perc;role
GATA-3;7,9;confirmed in this cancer
CCDC74A;6,8;prognostic in this cancer
LINC00621;6,1;none
POLRMTP1;4,1;none
IGF2BP3;3,2;confirmed in this cancer
which produced this plot
There are two things I don't get here:
1) Why the x-axis tick of each bar correspond to the actual value plotted ? I mean why the x-axis isn't from 0 to 8, and should be in my opinion. I hope I explain correctly.
2) The label value seems unaligned with the y-thick. Am I missing an option here ?

To be honest, I would probably not use ggpubr here. Staying in the ggplot syntax is often safer. And also arguably less code...
(Also, don't use factors in this case, as user teunbrand commented)
Two good options for horizontal bars:
library(tidyverse)
library(ggstance)
library(ggsci)
Option 1 - use coord_flip
ggplot(df2, aes(fct_reorder(genes, perc), perc, fill = role)) +
geom_col() +
geom_text(aes(label = perc), hjust = 0) +
scale_fill_npg() +
coord_flip(ylim = c(0,100)) +
theme_classic() +
theme(legend.position = 'top') +
labs(x = 'gene', y = 'percent')
option 2 - use the ggstance package
I prefer option 2, because using ggstance allows for more flexible combination with other plots
ggplot(df2, aes(perc, fct_reorder(genes, perc), fill = role)) +
geom_colh() +
geom_text(aes(label = perc), hjust = 0)+
scale_fill_npg() +
coord_cartesian(xlim = c(0,100)) +
theme_classic() +
theme(legend.position = 'top')+
labs(x = 'gene', y = 'percent')
Created on 2020-03-27 by the reprex package (v0.3.0)
data
df2 <- read_delim("genes;perc;role
GATA-3;7,9;confirmed in this cancer
CCDC74A;6,8;prognostic in this cancer
LINC00621;6,1;none
POLRMTP1;4,1;none
IGF2BP3;3,2;confirmed in this cancer", ";") %>% rownames_to_column("name")

Related

How to add percentages on top of an histogram when data is grouped

This is not my data (for confidentiality reasons), but I have tried to create a reproducible example using a dataset included in the ggplot2 library. I have an histogram summarizing the value of some variable by group (factor of 2 levels). First, I did not want the counts but proportions of the total, so I used that code:
library(ggplot2)
library(dplyr)
df_example <- diamonds %>% as.data.frame() %>% filter(cut=="Premium" | cut=="Ideal")
ggplot(df_example,aes(x=z,fill=cut)) +
geom_histogram(aes(y=after_stat(width*density)),binwidth=1,center=0.5,col="black") +
facet_wrap(~cut) +
scale_x_continuous(breaks=seq(0,9,by=1)) +
scale_y_continuous(labels=scales::percent_format(accuracy=2,suffix="")) +
scale_fill_manual(values=c("#CC79A7","#009E73")) +
labs(x="Depth (mm)",y="Count") +
theme_bw() + theme(legend.position="none")
It gave me this as a result.
enter image description here
The issue is that I would like to print the numeric percentages on top of the bins and haven't find a way to do so.
As I saw it done for printing counts elsewhere, I attempted to print them using stat_bin(), including the same y and label values as the y in geom_histogram, thinking it would print the right numbers:
ggplot(df_example,aes(x=z,fill=cut)) +
geom_histogram(aes(y=after_stat(width*density)),binwidth=1,center=0.5,col="black") +
stat_bin(aes(y=after_stat(width*density),label=after_stat(width*density*100)),geom="text",vjust=-.5) +
facet_wrap(~cut) +
scale_x_continuous(breaks=seq(0,9,by=1)) +
scale_y_continuous(labels=scales::percent_format(accuracy=2,suffix="")) +
scale_fill_manual(values=c("#CC79A7","#009E73")) +
labs(x="Depth (mm)",y="%") +
theme_bw() + theme(legend.position="none")
However, it does print way more values than there are bins, these values do not appear consistent with what is portrayed by the bar heights and they do not print in respect to vjust=-.5 which would make them appear slightly above the bars.
enter image description here
What am I missing here? I know that if there was no grouping variable/facet_wrap, I could use after_stat(count/sum(count)) instead of after_stat(width*density) and it seems that it would have fixed my issue. But I need the histograms for both groups to appear next to each other. Thanks in advance!
You have to use the same arguments in stat_bin as for the histogram when adding your labels to get same binning for both layers and to align the labels with the bars:
library(ggplot2)
library(dplyr)
df_example <- diamonds %>%
as.data.frame() %>%
filter(cut == "Premium" | cut == "Ideal")
ggplot(df_example, aes(x = z, fill = cut)) +
geom_histogram(aes(y = after_stat(width * density)),
binwidth = 1, center = 0.5, col = "black"
) +
stat_bin(
aes(
y = after_stat(width * density),
label = scales::number(after_stat(width * density), scale = 100, accuracy = 1)
),
geom = "text", binwidth = 1, center = 0.5, vjust = -.25
) +
facet_wrap(~cut) +
scale_x_continuous(breaks = seq(0, 9, by = 1)) +
scale_y_continuous(labels = scales::number_format(scale = 100)) +
scale_fill_manual(values = c("#CC79A7", "#009E73")) +
labs(x = "Depth (mm)", y = "%") +
theme_bw() +
theme(legend.position = "none")

Summarizing data into percentages for side-by-side Bar Charts in R

Below is the code I am having trouble with and its output. The data set is linked at the bottom of the post.
What I am wanting to do is group the StateCodes together with each MSN (opposite of what is showing now in the output).
plotdata <- EnergyData %>%
filter(MSN %in% c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB")) %>%
filter(Year %in% c("2009")) %>%
select(StateCode, MSN, Data) %>%
group_by(StateCode) %>%
mutate(pct = Data/sum(Data),
lbl = scales::percent(pct))
plotdata
This outputs to:
I thought that the group_by function would do that for me but I would like to know if I am missing a key chunk of code?
Once the above chunk runs correctly, I want to create side by side Bar charts by StateCode using the percentages of each of the 5 MSN's.
Here's the code I have so far.
ggplot(EnergyData,
aes(x = factor(StateCode,
levels = c("AZ", "CA", "NM", "TX")),
y = pct,
fill = factor(drv,
levels = c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB"),
labels = c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB")))) +
geom_bar(stat = "identity",
position = "fill") +
scale_y_continuous(breaks = seq(0, 1, .2),
label = pct) +
geom_text(aes(label = lbl),
size = 3,
position = position_stack(vjust = 0.5)) +
scale_fill_brewer(palette = "Set2") +
labs(y = "Percent",
fill = "MSN",
x = "State",
title = "Renewable Resources by State") +
theme_minimal()
As of now I believe this all has to do with how I create the percentages for the bar charts.
Any assistance would be great. Thank you!
Here's the data I used Energy Data http://www.mathmodels.org/Problems/2018/MCM-C/ProblemCData.xlsx
Here is a version using data.table for the initial filtering, and changes to the plot function that hopefully get you the result you are after:
library(readxl)
library(data.table)
library(ggplot2)
download.file("http://www.mathmodels.org/Problems/2018/MCM-C/ProblemCData.xlsx", "~/ex/ProblemCData.xlsx")
# by default, factor levels will be in alphabetical order, so we do not need to specify that
EnergyData <- data.table(read_xlsx("~/ex/ProblemCData.xlsx"), key="StateCode", stringsAsFactors = TRUE)
# filter by Year and MSN list
plotdata <- EnergyData[as.character(MSN) %chin% c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB") & Year == 2009]
# calculate percentages of Data by StateCode
plotdata[, pct := Data/sum(Data), by = "StateCode"]
# plot using percent format and specified number of breaks
ggplot(plotdata,
aes(x = StateCode,
y = pct,
fill = MSN)) +
geom_bar(stat = "identity",
position = "fill") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), n.breaks = 6) +
scale_fill_brewer(palette = "Set2") +
labs(y = "Percent",
fill = "MSN",
x = "State",
title = "Renewable Resources by State") +
theme_minimal()
Created on 2020-03-20 by the reprex package (v0.3.0)

How can I plot 2 related variables on the same axis using ggplot? [duplicate]

Edit: This question has been marked as duplicated, but the responses here have been tried and did not work because the case in question is a line chart, not a bar chart. Applying those methods produces a chart with 5 lines, 1 for each year - not useful. Did anyone who voted to mark as duplicate actually try those approaches on the sample dataset supplied with this question? If so please post as an answer.
Original Question:
There's a feature in Excel pivot charts which allows multilevel categorical axes.I'm trying to find a way to do the same thing with ggplot (or any other plotting package in R).
Consider the following dataset:
set.seed(1)
df=data.frame(year=rep(2009:2013,each=4),
quarter=rep(c("Q1","Q2","Q3","Q4"),5),
sales=40:59+rnorm(20,sd=5))
If this is imported to an Excel pivot table, it is straightforward to create the following chart:
Note how the x-axis has two levels, one for quarter and one for the grouping variable, year. Are multilevel axes possible with ggplot?
NB: There is a hack with facets that produces something similar, but this is not what I'm looking for.
library(ggplot2)
ggplot(df) +
geom_line(aes(x=quarter,y=sales,group=year))+
facet_grid(.~year,scales="free")
New labels are added using annotate(geom = "text",. Turn off clipping of x axis labels with clip = "off" in coord_cartesian.
Use theme to add extra margins (plot.margin) and remove (element_blank()) x axis text (axis.title.x, axis.text.x) and vertical grid lines (panel.grid.x).
library(ggplot2)
ggplot(data = df, aes(x = interaction(year, quarter, lex.order = TRUE),
y = sales, group = 1)) +
geom_line(colour = "blue") +
annotate(geom = "text", x = seq_len(nrow(df)), y = 34, label = df$quarter, size = 4) +
annotate(geom = "text", x = 2.5 + 4 * (0:4), y = 32, label = unique(df$year), size = 6) +
coord_cartesian(ylim = c(35, 65), expand = FALSE, clip = "off") +
theme_bw() +
theme(plot.margin = unit(c(1, 1, 4, 1), "lines"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
See also the nice answer by #eipi10 here: Axis labels on two lines with nested x variables (year below months)
The suggested code by Henrik does work and helped me a lot! I think the solution has a high value. But please be aware, that there is a small misstake in the first line of the code, which results in a wrong order of the data.
Instead of
... aes(x = interaction(year,quarter), ...
it should be
... aes(x = interaction(quarter,year), ...
The resulting graphic has the data in the right order.
P.S. I suggested an edit (which was rejected until now) and, due to a small lack of reputation, I am not allowed to comment, what I rather would have done.
User Tung had a great answer on this thread
library(tidyverse)
library(lubridate)
library(scales)
set.seed(123)
df <- tibble(
date = as.Date(41000:42000, origin = "1899-12-30"),
value = c(rnorm(500, 5), rnorm(501, 10))
)
# create year column for facet
df <- df %>%
mutate(year = as.factor(year(date)))
p <- ggplot(df, aes(date, value)) +
geom_line() +
geom_vline(xintercept = as.numeric(df$date[yday(df$date) == 1]), color = "grey60") +
scale_x_date(date_labels = "%b",
breaks = pretty_breaks(),
expand = c(0, 0)) +
# switch the facet strip label to the bottom
facet_grid(.~ year, space = 'free_x', scales = 'free_x', switch = 'x') +
labs(x = "") +
theme_classic(base_size = 14, base_family = 'mono') +
theme(panel.grid.minor.x = element_blank()) +
# remove facet spacing on x-direction
theme(panel.spacing.x = unit(0,"line")) +
# switch the facet strip label to outside
# remove background color
theme(strip.placement = 'outside',
strip.background.x = element_blank())
p

R: placing a text with combination of variables over bars in ggplot

Lets draw a bar chart with ggplot2 from the following data (already in a long format). The values of the variable are then placed in the middle of the bars via geom_text() directive.
stuff.dat<-read.csv(text="continent,stuff,num
America,apples,13
America,bananas,13
Europe,apples,30
Europe,bananas,21
total,apples,43
total,bananas,34")
library(ggplot2)
ggplot(stuff.dat, aes(x=continent, y=num,fill=stuff))+geom_col() +
geom_text(position = position_stack(vjust=0.5),
aes(label=num))
Now it is necessary to add on top of the bars the "Apple-Bananas Index", which is defined as f=apples/bananas - just as manually added in the figure. How to program this in ggplot? How it would be possible to add it to the legend as a separate entry?
I think that the easiest way to achieve this is to prepare the data before you create the plot. I define a function abi() that computes the apple-banana-index from stuff.dat given a continent:
abi <- function(cont) {
with(stuff.dat,
num[continent == cont & stuff == "apples"] / num[continent == cont & stuff == "bananas"]
)
}
And then I create a data frame with all the necessary data:
conts <- levels(stuff.dat$continent)
abi_df <- data.frame(continent = conts,
yf = aggregate(num ~ continent, sum, data = stuff.dat)$num + 5,
abi = round(sapply(conts, abi), 1))
Now, I can add that information to the plot:
library(ggplot2)
ggplot(stuff.dat, aes(x = continent, y = num, fill = stuff)) +
geom_col() +
geom_text(position = position_stack(vjust = 0.5), aes(label = num)) +
geom_text(data = abi_df, aes(y = yf, label = paste0("f = ", abi), fill = NA))
Adding fill = NA to the geom_text() is a bit of a hack and leads to a warning. But if fill is not set, plotting will fail with a message that stuff was not found. I also tried to move fill = stuff from ggplot() to geom_col() but this breaks the y⁻coordinate of the text labels inside the bars. There might be a cleaner solution to this, but I haven't found it yet.
Adding the additional legend is, unfortunately, not trivial, because one cannot easily add text outside the plot area. This actually needs two steps: first one adds text using annotation_custom(). Then, you need to turn clipping off to make the text visible (see, e.g., here). This is a possible solution:
p <- ggplot(stuff.dat, aes(x = continent, y = num, fill = stuff)) +
geom_col() +
geom_text(position = position_stack(vjust = 0.5), aes(label = num)) +
geom_text(data = abi_df, aes(y = yf, label = paste0("f = ", abi), fill = NA)) +
guides(size = guide_legend(title = "f: ABI", override.aes = list(fill = 1))) +
annotation_custom(grob = textGrob("f: ABI\n(Apple-\nBanana-\nIndex",
gp = gpar(cex = .8), just = "left"),
xmin = 3.8, xmax = 3.8, ymin = 17, ymax = 17)
# turn off clipping
library(grid)
gt <- ggplot_gtable(ggplot_build(p))
gt$layout$clip[gt$layout$name == "panel"] <- "off"
grid.draw(gt)

ggplot, facet, piechart: placing text in the middle of pie chart slices

I'm trying to produce a facetted pie-chart with ggplot and facing problems with placing text in the middle of each slice:
dat = read.table(text = "Channel Volume Cnt
AGENT high 8344
AGENT medium 5448
AGENT low 23823
KIOSK high 19275
KIOSK medium 13554
KIOSK low 38293", header=TRUE)
vis = ggplot(data=dat, aes(x=factor(1), y=Cnt, fill=Volume)) +
geom_bar(stat="identity", position="fill") +
coord_polar(theta="y") +
facet_grid(Channel~.) +
geom_text(aes(x=factor(1), y=Cnt, label=Cnt, ymax=Cnt),
position=position_fill(width=1))
The output:
What parameters of geom_text should be adjusted in order to place numerical labels in the middle of piechart slices?
Related question is Pie plot getting its text on top of each other but it doesn't handle case with facet.
UPDATE: following Paul Hiemstra advice and approach in the question above I changed code as follows:
---> pie_text = dat$Cnt/2 + c(0,cumsum(dat$Cnt)[-length(dat$Cnt)])
vis = ggplot(data=dat, aes(x=factor(1), y=Cnt, fill=Volume)) +
geom_bar(stat="identity", position="fill") +
coord_polar(theta="y") +
facet_grid(Channel~.) +
geom_text(aes(x=factor(1),
---> y=pie_text,
label=Cnt, ymax=Cnt), position=position_fill(width=1))
As I expected tweaking text coordiantes is absolute but it needs be within facet data:
NEW ANSWER: With the introduction of ggplot2 v2.2.0, position_stack() can be used to position the labels without the need to calculate a position variable first. The following code will give you the same result as the old answer:
ggplot(data = dat, aes(x = "", y = Cnt, fill = Volume)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Cnt), position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y") +
facet_grid(Channel ~ ., scales = "free")
To remove "hollow" center, adapt the code to:
ggplot(data = dat, aes(x = 0, y = Cnt, fill = Volume)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Cnt), position = position_stack(vjust = 0.5)) +
scale_x_continuous(expand = c(0,0)) +
coord_polar(theta = "y") +
facet_grid(Channel ~ ., scales = "free")
OLD ANSWER: The solution to this problem is creating a position variable, which can be done quite easily with base R or with the data.table, plyr or dplyr packages:
Step 1: Creating the position variable for each Channel
# with base R
dat$pos <- with(dat, ave(Cnt, Channel, FUN = function(x) cumsum(x) - 0.5*x))
# with the data.table package
library(data.table)
setDT(dat)
dat <- dat[, pos:=cumsum(Cnt)-0.5*Cnt, by="Channel"]
# with the plyr package
library(plyr)
dat <- ddply(dat, .(Channel), transform, pos=cumsum(Cnt)-0.5*Cnt)
# with the dplyr package
library(dplyr)
dat <- dat %>% group_by(Channel) %>% mutate(pos=cumsum(Cnt)-0.5*Cnt)
Step 2: Creating the facetted plot
library(ggplot2)
ggplot(data = dat) +
geom_bar(aes(x = "", y = Cnt, fill = Volume), stat = "identity") +
geom_text(aes(x = "", y = pos, label = Cnt)) +
coord_polar(theta = "y") +
facet_grid(Channel ~ ., scales = "free")
The result:
I would like to speak out against the conventional way of making pies in ggplot2, which is to draw a stacked barplot in polar coordinates. While I appreciate the mathematical elegance of that approach, it does cause all sorts of headaches when the plot doesn't look quite the way it's supposed to. In particular, precisely adjusting the size of the pie can be difficult. (If you don't know what I mean, try to make a pie chart that extends all the way to the edge of the plot panel.)
I prefer drawing pies in a normal cartesian coordinate system, using geom_arc_bar() from ggforce. It requires a little bit of extra work on the front end, because we have to calculate angles ourselves, but that's easy and the level of control we get as a result is more than worth it.
I've used this approach in previous answers here and here.
The data (from the question):
dat = read.table(text = "Channel Volume Cnt
AGENT high 8344
AGENT medium 5448
AGENT low 23823
KIOSK high 19275
KIOSK medium 13554
KIOSK low 38293", header=TRUE)
The pie-drawing code:
library(ggplot2)
library(ggforce)
library(dplyr)
# calculate the start and end angles for each pie
dat_pies <- left_join(dat,
dat %>%
group_by(Channel) %>%
summarize(Cnt_total = sum(Cnt))) %>%
group_by(Channel) %>%
mutate(end_angle = 2*pi*cumsum(Cnt)/Cnt_total, # ending angle for each pie slice
start_angle = lag(end_angle, default = 0), # starting angle for each pie slice
mid_angle = 0.5*(start_angle + end_angle)) # middle of each pie slice, for the text label
rpie = 1 # pie radius
rlabel = 0.6 * rpie # radius of the labels; a number slightly larger than 0.5 seems to work better,
# but 0.5 would place it exactly in the middle as the question asks for.
# draw the pies
ggplot(dat_pies) +
geom_arc_bar(aes(x0 = 0, y0 = 0, r0 = 0, r = rpie,
start = start_angle, end = end_angle, fill = Volume)) +
geom_text(aes(x = rlabel*sin(mid_angle), y = rlabel*cos(mid_angle), label = Cnt),
hjust = 0.5, vjust = 0.5) +
coord_fixed() +
scale_x_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
facet_grid(Channel~.)
To show why I think this this approach is so much more powerful than the conventional (coord_polar()) approach, let's say we want the labels on the outside of the pie rather than inside. This creates a couple of problems, such as we will have to adjust hjust and vjust depending on the side of the pie a label falls, and also we will have to make the
plot panel wider than high to make space for the labels on the side without generating excessive space above and below. Solving these problems in the polar coordinate approach is not fun, but it's trivial in the cartesian coordinates:
# generate hjust and vjust settings depending on the quadrant into which each
# label falls
dat_pies <- mutate(dat_pies,
hjust = ifelse(mid_angle>pi, 1, 0),
vjust = ifelse(mid_angle<pi/2 | mid_angle>3*pi/2, 0, 1))
rlabel = 1.05 * rpie # now we place labels outside of the pies
ggplot(dat_pies) +
geom_arc_bar(aes(x0 = 0, y0 = 0, r0 = 0, r = rpie,
start = start_angle, end = end_angle, fill = Volume)) +
geom_text(aes(x = rlabel*sin(mid_angle), y = rlabel*cos(mid_angle), label = Cnt,
hjust = hjust, vjust = vjust)) +
coord_fixed() +
scale_x_continuous(limits = c(-1.5, 1.4), name = "", breaks = NULL, labels = NULL) +
scale_y_continuous(limits = c(-1, 1), name = "", breaks = NULL, labels = NULL) +
facet_grid(Channel~.)
To tweak the position of the label text relative to the coordinate, you can use the vjust and hjust arguments of geom_text. This will determine the position of all labels simultaneously, so this might not be what you need.
Alternatively, you could tweak the coordinate of the label. Define a new data.frame where you average the Cnt coordinate (label_x[i] = Cnt[i+1] + Cnt[i]) to position the label in the center of that particular pie. Just pass this new data.frame to geom_text in replacement of the original data.frame.
In addition, piecharts have some visual interpretation flaws. In general I would not use them, especially where good alternatives exist, e.g. a dotplot:
ggplot(dat, aes(x = Cnt, y = Volume)) +
geom_point() +
facet_wrap(~ Channel, ncol = 1)
For example, from this plot it is obvious that Cnt is higher for Kiosk than for Agent, this information is lost in the piechart.
Following answer is partial, clunky and I won't accept it.
The hope is that it will solicit better solution.
text_KIOSK = dat$Cnt
text_AGENT = dat$Cnt
text_KIOSK[dat$Channel=='AGENT'] = 0
text_AGENT[dat$Channel=='KIOSK'] = 0
text_KIOSK = text_KIOSK/1.7 + c(0,cumsum(text_KIOSK)[-length(dat$Cnt)])
text_AGENT = text_AGENT/1.7 + c(0,cumsum(text_AGENT)[-length(dat$Cnt)])
text_KIOSK[dat$Channel=='AGENT'] = 0
text_AGENT[dat$Channel=='KIOSK'] = 0
pie_text = text_KIOSK + text_AGENT
vis = ggplot(data=dat, aes(x=factor(1), y=Cnt, fill=Volume)) +
geom_bar(stat="identity", position=position_fill(width=1)) +
coord_polar(theta="y") +
facet_grid(Channel~.) +
geom_text(aes(y=pie_text, label=format(Cnt,format="d",big.mark=','), ymax=Inf), position=position_fill(width=1))
It produces following chart:
As you noticed I can't move labels for green (low).

Resources