Related
I have a dataset containing y variable as Year and x variables as (A, B, C(%)). I have attached the dataset here.
dput(result)
structure(list(Year = 2008:2021, A = c(4L, 22L, 31L, 48L, 54L,
61L, 49L, 56L, 59L, 85L, 72L, 58L, 92L, 89L), B = c(1L, 2L, 6L,
7L, 14L, 21L, 15L, 27L, 27L, 46L, 41L, 26L, 51L, 62L), C... = c(25,
9.09, 19.35, 14.58, 25.93, 34.43, 30.61, 48.21, 45.76, 54.12,
56.94, 44.83, 55.43, 69.66)), class = "data.frame", row.names = c(NA,
-14L))
The variables A and B will be plotted as stacked bar graph and the C will be plotted as line chart in the same plot. I have generated the plot using excel like below:
How can I create the same plot in R?
You first need to reshape longer, for example with pivot_longer() from tidyr, and then you can use ggplot2 to plot the bars and the line in two separate layers. The fill = argument in the geom_bar(aes()) lets you stratify each bar according to a categorical variable - name is created automatically by pivot_longer().
library(ggplot2)
library(tidyr)
dat |>
pivot_longer(A:B) |>
ggplot(aes(x = Year)) +
geom_bar(stat = "identity", aes(y = value, fill = name)) +
geom_line(aes(y = `C(%)`), size = 2)
Created on 2022-06-09 by the reprex package (v2.0.1)
You're asking for overlaid bars, in which case there's no need to pivot, and you can add separate layers. However I would argue that this could confuse or mislead many people - usually in stacked plots bars are stacked, not overlaid, so thread with caution!
library(ggplot2)
library(tidyr)
dat |>
ggplot(aes(x = Year)) +
geom_bar(stat = "identity", aes(y = A), fill = "lightgreen") +
geom_bar(stat = "identity", aes(y = B), fill = "red", alpha = 0.5) +
geom_line(aes(y = `C(%)`), size = 2) +
labs(y = "", caption = "NB: bars are overlaid, not stacked!")
Created on 2022-06-09 by the reprex package (v2.0.1)
I propose this:
library(data.table)
library(ggplot2)
library(ggthemes)
dt <- fread("dataset.csv")
dt.long <- melt(dt, id.vars = c("Year"))
dt.AB <- dt.long[variable %in% c("A", "B"), ]
dt.C <- copy(dt.long[variable == "C(%)", .(Year, variable, value = value * 3/2)])
ggplot(dt.AB, aes(x = Year, y = value, fill = variable), ) +
geom_bar(stat = "identity") +
geom_line(data=dt.C, colour='red', aes(x = Year, y = value)) +
scale_x_continuous(breaks = pretty(dt.AB$Year,
n = length(unique(dt.AB$Year)))) +
scale_y_continuous(
name = "A&B",
breaks = seq (0, 150, 10),
sec.axis = sec_axis(~.*2/3, name="C(%)", breaks = seq (0, 100, 10))
) + theme_hc() +
scale_fill_manual(values=c("grey70", "grey50", "grey30")) +
theme(
axis.line.y = element_line(colour = 'black', size=0.5,
linetype='solid'))
Given a dataframe as follows:
df <- structure(list(date = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 2L, 3L, 4L, 13L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 14L, 15L, 16L, 25L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
26L, 27L, 28L), .Label = c("2010/1/31", "2010/10/31", "2010/11/30",
"2010/12/31", "2010/2/28", "2010/3/31", "2010/4/30", "2010/5/31",
"2010/6/30", "2010/7/31", "2010/8/31", "2010/9/30", "2011/1/31",
"2011/10/31", "2011/11/30", "2011/12/31", "2011/2/28", "2011/3/31",
"2011/4/30", "2011/5/31", "2011/6/30", "2011/7/31", "2011/8/31",
"2011/9/30", "2012/1/31", "2012/10/31", "2012/11/30", "2012/12/31",
"2012/2/29", "2012/3/31", "2012/4/30", "2012/5/31", "2012/6/30",
"2012/7/31", "2012/8/31", "2012/9/30"), class = "factor"), pct = c(14,
17.9, 17.9, 18.1, 18.2, 18.2, 18.2, 18.2, 18.3, 18.3, 18.4, 18.8,
19.9, 15.8, 16.34, 16.5, 16.6, 16.8, 16.8, 16.9, 17, 17, 17,
18.5, 13.1, 14.7, 14.8, 14.7, 14.5, 14.4, 14.2, 14.1, 14.1, 14.1,
14.2, 14.5), values = c(12718.1, 25052.3, 36374, 47884.4, 60339.5,
72669.4, 84922.2, 97492, 111028.5, 125313.3, 139224.2, 154553.7,
15249, 29018.1, 42921.8, 56570.8, 71267.6, 85832.7, 100240.7,
114945.7, 130810.8, 147357.2, 163486.1, 181225.8, 17222.1, 33668.6,
49318.8, 64921.9, 81636.7, 98221.6, 114536.5, 131195.4, 149422,
168355.8, 186832.5, 207166.7)), class = "data.frame", row.names = c(NA,
-36L))
I have plotted it with the following code:
df$date <- as.Date(df$date, format = "%Y/%m/%d")
df_m <- melt(df, id.vars='date')
df_m_x <- df_m %>%
filter(variable %in% c("values"))
df_m_ratio_x <- df_m %>%
filter(variable %in% c("pct")) %>%
mutate(value = value * 10000)
coeff = 1/10000
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(limits = c(min(df$date), max(df$date)), breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm")
Out:
But as you may notice, the date in the x axis are missaligned by one month in the figure.
How could I solve this problem? Thanks.
The issue appears to be differences in how binning occurs between geom_bar and geom_point when you set the limits manually in scale_x_date. Perhaps omitting that would be acceptable:
library(ggplot2)
library(scales)
coeff = 1/10000
ggplot(data = df, aes(x = as.Date(date, format = "%Y/%m/%d"))) +
geom_bar(aes(y = values), alpha = 0.5, stat = 'identity', fill = "#F8766D") +
geom_point(aes(y = pct * 1/coeff), size = 3, color = "#F8766D") +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(date_breaks= "6 months", date_labels = "%Y-%m", name = "date")
The reason that the bars appear to be "off" is because the bars are actually plotted slightly before the breaks. Here is a blown up version:
An alternative might be to use the yearmon format from the zoo package:
library(zoo)
coeff = 1/10000
ggplot(data = df, aes(x = as.yearmon(date, format = "%Y/%m/%d"))) +
geom_bar(aes(y = values), alpha = 0.5, stat = 'identity', fill = "#F8766D") +
geom_point(aes(y = pct * 1/coeff), size = 3, color = "#F8766D") +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_yearmon(format = "%Y-%m", name = "date")
I am not sure if you have noticed. In OP, limits = c(min(df$date), max(df$date)) might have removed two observations, the first month and the last month on your bar chart.
I generated a marker for month from 1 to 36 over 3 years to show the problem:
df_m_x$month = c(1:36)
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(
limits = c(min(df$date), max(df$date)),
breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm") +
geom_text(data= df_m_x, aes(x = date, y = value, label = month))
Remove limits...,
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(
# limits = c(min(df$date), max(df$date)),
breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm") +
geom_text(data= df_m_x, aes(x = date, y = value, label = month))
I am trying to get the colours of a confusion matrix to correspond to the percent value in the middle of each matrix.
I have tried adjusting the geom_tile section fill to various options of Freq, or percentage, but with no luck.
valid_actualFunc <- as.factor(c(conf$ObsFunc))
valid_predFunc <- as.factor(c(conf$PredFunc))
cfmFunc <- confusionMatrix(valid_actualFunc, valid_predFunc)
ggplotConfusionMatrix <- function(m){
mytitle <- paste("Accuracy", percent_format()(m$overall[1]),
"Kappa", percent_format()(m$overall[2]))
data_c <- mutate(group_by(as.data.frame(m$table), Prediction ),
percentage=percent(Freq/sum(Freq)))
p <-
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = Freq/sum(Freq)), colour = "white") +
scale_fill_gradient(low = "white", high = "red", na.value="white") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(axis.text.x=element_text(angle = -90, hjust = 0),
axis.ticks=element_blank(), legend.position="none") +
ggtitle(mytitle)+
scale_y_discrete(limits = rev(levels(as.factor(valid_predFunc))))
return(p)
}
conf2Func=ggplotConfusionMatrix(cfmFunc)
conf2Func
Currently the fill is not equal to the value in the middle, i.e. a tile with 89% is lighter than one with 70%
As per the comment the return is
dput(head(cfmFunc))
list(positive = NULL, table = structure(c(2331L, 102L, 262L,
52L, 290L, 1986L, 178L, 89L, 495L, 74L, 2966L, 52L, 189L, 58L,
92L, 800L), .Dim = c(4L, 4L), .Dimnames = list(Prediction = c("Algae",
"Hard Coral", "Other", "Other Inv"), Reference = c("Algae", "Hard Coral",
"Other", "Other Inv")), class = "table"), overall = c(Accuracy =
0.807008785942492,
Kappa = 0.730790156424558, AccuracyLower = 0.799141307917932,
AccuracyUpper = 0.814697342402988, AccuracyNull = 0.358126996805112,
AccuracyPValue = 0, McnemarPValue = 6.95780670112837e-62), byClass =
structure(c(0.848562067710229,
0.780967361384192, 0.826874825759688, 0.702370500438982,
0.866006328243225,
0.968687274187073, 0.917249961113703, 0.978258420637603,
0.705295007564297,
0.894594594594595, 0.847913093196112, 0.805639476334341,
0.938012218745343,
0.928553104155977, 0.904725375882172, 0.962429347223761,
0.705295007564297,
0.894594594594595, 0.847913093196112, 0.80563947633434, 0.848562067710229,
0.780967361384192, 0.826874825759688, 0.702370500438982,
0.770323859881031,
0.833928196514802, 0.837261820748059, 0.75046904315197, 0.274261182108626,
0.253893769968051, 0.358126996805112, 0.113718051118211,
0.232727635782748,
0.198282747603834, 0.296126198083067, 0.0798722044728434,
0.329972044728434,
0.221645367412141, 0.349241214057508, 0.0991413738019169,
0.857284197976727,
0.874827317785633, 0.872062393436696, 0.840314460538292), .Dim = c(4L,
11L), .Dimnames = list(c("Class: Algae", "Class: Hard Coral",
"Class: Other", "Class: Other Inv"), c("Sensitivity", "Specificity",
"Pos Pred Value", "Neg Pred Value", "Precision", "Recall", "F1",
"Prevalence", "Detection Rate", "Detection Prevalence", "Balanced
Accuracy"
))), mode = "sens_spec", dots = list())
If you check the structure of your dataset to be plotted str(data_c), you will see that percentage is a character vector, and needs to be converted to numeric to be used as continuous input to the fill gradient.
data_c$percentage.numeric <- as.numeric(gsub("%", "", data_c$percentage))
You can use percentage.numeric for aes fill and percentage for aes label.
ggplot(data = data_c,
aes(x = Reference, y = Prediction)) +
geom_tile(aes(fill = percentage.numeric), colour = "white") +
scale_fill_gradient(low = "white", high = "red", na.value="white") +
geom_text(aes(x = Reference, y = Prediction, label = percentage)) +
theme(axis.text.x=element_text(angle = -90, hjust = 0),
axis.ticks=element_blank(), legend.position="none") +
ggtitle(mytitle)
Note scale_y_discrete(limits = rev(levels(as.factor(valid_predFunc)))) produces an error in your example
Error in as.factor(valid_predFunc) : object 'valid_predFunc' not found
I want to plot a boxplot using ggplot2, and i have more than one facet, each facet has different terms, as follows:
library(ggplot2)
p <- ggplot(
data=Data,
aes(x=trait,y=mean)
)
p <- p+facet_wrap(~SP,scales="free",nrow=1)
p <- p+geom_boxplot(aes(fill = Ref,
lower = mean - sd,
upper = mean + sd,
middle = mean,
ymin = min,
ymax = max,
width=c(rep(0.8/3,3),rep(0.8,9))),
lwd=0.5,
stat="identity")
as showed, the width of box in different facet is not the same, is there any way to adjust all the box at a same scale? I had tried to use facet_grid, it can automatically change the width of facets, but all facets share the same y axis.
Data
Data <- structure(list(SP = structure(c(3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L), .Label = c("Human", "Cattle", "Horse", "Maize"
), class = "factor"), Ref = structure(c(3L, 2L, 1L, 3L, 3L, 3L,
2L, 2L, 2L, 1L, 1L, 1L), .Label = c("LMM", "Half", "Adoptive"
), class = "factor"), trait = structure(c(11L, 11L, 11L, 14L,
13L, 12L, 14L, 13L, 12L, 14L, 13L, 12L), .Label = c("cad", "ht",
"t2d", "bd", "cd", "ra", "t1d", "fpro", "mkg", "scs", "coat colour",
"ywk", "ssk", "gdd"), class = "factor"), min = c(0.324122039,
0.336486555, 0.073152049, 0.895455441, 0.849944623, 0.825248005,
0.890413591, 0.852385351, 0.826470308, 0.889139116, 0.838256672,
0.723753592), max = c(0.665536838, 0.678764774, 0.34033228, 0.919794865,
0.955018001, 0.899903826, 0.913350912, 0.957305688, 0.89843716,
0.911257005, 0.955312678, 0.817489555), mean = c(0.4919168555,
0.5360103372, 0.24320509565, 0.907436221, 0.9057516121, 0.8552899502,
0.9035394117, 0.9068819173, 0.8572309823, 0.90125638965, 0.90217769835,
0.7667208778), sd = c(0.0790133656517775, 0.09704320004497, 0.0767552215753863,
0.00611921020505611, 0.0339614482273291, 0.0199389195311925,
0.00598633573504195, 0.0332634006653858, 0.0196465508521771,
0.00592476494699222, 0.0348144156099722, 0.0271827880539459)), .Names = c("SP",
"Ref", "trait", "min", "max", "mean", "sd"), class = "data.frame", row.names = c(10L,
11L, 12L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L))
While u/z-lin's answer works, there is a far simpler solution. Switch from facet_wrap(...) to use facet_grid(...). With facet_grid, you don't need to specify rows and columns. You are still able to specify scales= (which allows automatic adjustment of axis scales for each facet if wanted), but you can also specify space=, which does the same thing, but with the scaling of the overall facet width. This is what you want. Your function call is now something like this:
ggplot(Data, aes(x = trait, y = mean)) +
geom_boxplot(aes(
fill = Ref, lower = mean-sd, upper = mean+sd, middle = mean,
ymin = min, ymax = max),
lwd = 0.5, stat = "identity") +
facet_grid(. ~ SP, scales = "free", space='free') +
scale_x_discrete(expand = c(0, 0.5)) +
theme_bw()
Some more description of layout of facets can be found here.
As #cdtip mentioned, this does not allow for independent y scales for each facet, which is what the OP asked for initially. Luckily, there is also a simple solution for this, which utilizes facet_row() from the ggforce package:
library(ggforce)
# same as above without facet_grid call..
p <- ggplot(Data, aes(x = trait, y = mean)) +
geom_boxplot(aes(
fill = Ref, lower = mean-sd, upper = mean+sd, middle = mean,
ymin = min, ymax = max),
lwd = 0.5, stat = "identity") +
scale_x_discrete(expand = c(0, 0.5)) +
theme_bw()
p + ggforce::facet_row(vars(SP), scales = 'free', space = 'free')
You can adjust facet widths after converting the ggplot object to a grob:
# create ggplot object (no need to manipulate boxplot width here.
# we'll adjust the facet width directly later)
p <- ggplot(Data,
aes(x = trait, y = mean)) +
geom_boxplot(aes(fill = Ref,
lower = mean - sd,
upper = mean + sd,
middle = mean,
ymin = min,
ymax = max),
lwd = 0.5,
stat = "identity") +
facet_wrap(~ SP, scales = "free", nrow = 1) +
scale_x_discrete(expand = c(0, 0.5)) + # change additive expansion from default 0.6 to 0.5
theme_bw()
# convert ggplot object to grob object
gp <- ggplotGrob(p)
# optional: take a look at the grob object's layout
gtable::gtable_show_layout(gp)
# get gtable columns corresponding to the facets (5 & 9, in this case)
facet.columns <- gp$layout$l[grepl("panel", gp$layout$name)]
# get the number of unique x-axis values per facet (1 & 3, in this case)
x.var <- sapply(ggplot_build(p)$layout$panel_scales_x,
function(l) length(l$range$range))
# change the relative widths of the facet columns based on
# how many unique x-axis values are in each facet
gp$widths[facet.columns] <- gp$widths[facet.columns] * x.var
# plot result
grid::grid.draw(gp)
In general, you can determine the width of a box plot in ggplot like so:
ggplot(data= df, aes(x = `some x`, y = `some y`)) + geom_boxplot(width = `some witdth`)
In your case, you might consider setting the width of all the box plots to the range of x divided by the maximum number of elements (in the leftmost figure).
I'm trying to build a line chart with ggplot2 in which I would like to have 2 lines, each adapted to a different axis. I'm trying the following code (where df4 is my data frame):
p1 = ggplot(df4, mapping = aes(x=taxon, y=cov, group = 1, colour = "Coverage", xlab("Cover"))) +
geom_line() +
labs (x = "Taxon", y = "Coverage") +
geom_line(aes(y=depth, colour = "Depth")) +
theme(axis.text.x = element_text(angle = 75, hjust= 1, vjust = 1)) +
scale_colour_manual(values = c("navyblue", "green4")) +
scale_y_continuous(sec.axis = sec_axis(~./4, name = "Depth"))
With this, I am able to build a chart with 2 y-axis and 2 lines, but both lines are adapted to the primary y-axis (the secondary axis is there, but it's useless). Is there maybe a parameter with which I can ask my data to follow this axis?
Blue line values only go until 1, so they should be adapted to the secondary axis
This is an example of my data:
structure(list(taxon = structure(c(80L, 57L, 74L, 32L, 1L, 3L,
41L, 9L, 70L, 12L), .Label = c("c__Tremellomycetes", "f__Listeriaceae",
"f__Saccharomycetaceae", "g__Escherichia", "g__Klebsiella", "g__Pseudomonas",
"g__Saccharomyces", "g__Salmonella", "g__Staphylococcus", "s__Bacillus_amyloliquefaciens",
"s__Bacillus_phage_phi105", "s__Bacillus_siamensis", "s__Bacillus_sp_JS",
"s__Bacillus_subtilis", "s__Bacillus_vallismortis", "s__Citrobacter_sp_30_2",
"s__Cronobacter_phage_ENT47670", "s__Enterobacter_cancerogenus",
"s__Enterobacteria_phage_BP_4795", "s__Enterobacteria_phage_cdtI",
"s__Enterobacteria_phage_ES18", "s__Enterobacteria_phage_fiAA91_ss",
"s__Enterobacteria_phage_HK629", "s__Enterobacteria_phage_IME10",
"s__Enterobacteria_phage_lambda", "s__Enterobacteria_phage_mEp237",
"s__Enterobacteria_phage_mEp460", "s__Enterobacteria_phage_Min27",
"s__Enterobacteria_phage_P22", "s__Enterobacteria_phage_YYZ_2008",
"s__Enterococcus_faecalis", "s__Enterococcus_gilvus", "s__Enterococcus_phage_phiEf11",
"s__Enterococcus_phage_phiFL1A", "s__Enterococcus_phage_phiFL3A",
"s__Escherichia_coli", "s__Escherichia_phage_HK639", "s__Escherichia_phage_P13374",
"s__Lactobacillus_fermentum", "s__Listeria_innocua", "s__Listeria_ivanovii",
"s__Listeria_marthii", "s__Listeria_monocytogenes", "s__Listeria_phage_2389",
"s__Listeria_phage_A118", "s__Listeria_phage_A500", "s__Paenibacillus_sp_ICGEB2008",
"s__Phage_Gifsy_1", "s__Phage_Gifsy_2", "s__Pseudomonas_aeruginosa",
"s__Pseudomonas_mendocina", "s__Pseudomonas_phage_B3", "s__Pseudomonas_phage_D3",
"s__Pseudomonas_phage_DMS3", "s__Pseudomonas_phage_F10", "s__Pseudomonas_phage_F116",
"s__Pseudomonas_phage_PAJU2", "s__Pseudomonas_phage_Pf1", "s__Pseudomonas_phage_phi297",
"s__Pseudomonas_sp_2_1_26", "s__Pseudomonas_sp_P179", "s__Salmonella_enterica",
"s__Salmonella_phage_Fels_1", "s__Salmonella_phage_Fels_2", "s__Salmonella_phage_SETP13",
"s__Salmonella_phage_ST64B", "s__Shigella_phage_Sf6", "s__Staphylococcus_aureus",
"s__Staphylococcus_phage_42E", "s__Staphylococcus_phage_55",
"s__Staphylococcus_phage_80alpha", "s__Staphylococcus_phage_P954",
"s__Staphylococcus_phage_phi2958PVL", "s__Staphylococcus_phage_phiMR25",
"s__Staphylococcus_phage_phiN315", "s__Staphylococcus_phage_phiNM3",
"s__Staphylococcus_phage_phiPVL_CN125", "s__Staphylococcus_phage_phiPVL108",
"s__Staphylococcus_phage_PT1028", "s__Staphylococcus_phage_StauST398_1",
"s__Staphylococcus_phage_StauST398_3", "s__Staphylococcus_prophage_phiPV83",
"s__Stx2_converting_phage_1717", "s__Stx2_converting_phage_86"
), class = "factor"), cov = c(0.987654320987654, 0.99685534591195,
0.994535519125683, 0.147003745318352, 0.390923694779116, 0.92831541218638,
0.99079754601227, 0.993055555555556, 0.497512437810945, 0.58144695960941
), depth = c(1.68148148148148, 0.99685534591195, 0.994535519125683,
0.147003745318352, 0.390923694779116, 0.92831541218638, 0.99079754601227,
1.34722222222222, 0.497512437810945, 0.58144695960941)), .Names = c("taxon",
"cov", "depth"), row.names = c(40L, 10L, 58L, 44L, 7L, 55L, 29L,
13L, 2L, 53L), class = "data.frame")
You just need to multiply the 'depth' geom_line with 4 :
ggplot(df4, mapping = aes(x=taxon, y=cov, group = 1, colour = "Coverage", xlab("Cover"))) +
geom_line() +
labs (x = "Taxon", y = "Coverage") +
geom_line(aes(y=depth * 4, colour = "Depth")) +
theme(axis.text.x = element_text(angle = 75, hjust= 1, vjust = 1)) +
scale_colour_manual(values = c("navyblue", "green4")) +
scale_y_continuous(sec.axis = sec_axis(~./4, name = "Depth"))