How to create a waterfall plot? - r

I want to visualize a bar graph in an additive way (a waterfall plot, see below).
This is the data:
# dput() representation of the input data frame: a character column `Parameter`
# and a numeric column `Values`, eight rows, the last one labelled "total".
structure(list(Parameter = c("Driving", "Driver Behaviour", "Road Quality",
"Passenger load", "Speed", "Topography", "climate", "total"),
Values = c(0.8, 0.2, 0.2, 0.2, 0.24, 0.5, 0.8, 2.82)),
row.names = c(NA, -8L), class = "data.frame")
# Printed form of the data frame above:
# Parameter Values
# 1 Driving 0.80
# 2 Driver Behaviour 0.20
# 3 Road Quality 0.20
# 4 Passenger load 0.20
# 5 Speed 0.24
# 6 Topography 0.50
# 7 climate 0.80
# 8 total 2.82
This is the output I'm trying to produce. Is there any way I can do it in R?

This still needs some polishing, but in principle your waterfall chart can be achieved like so:
BTW: Probably because of rounding errors, the individual values in your data add up to 2.94 instead of the stated total of 2.82.
# Example data: seven individual contributions followed by a "total" row.
d <- structure(
  list(
    Parameter = c(
      "Driving", "Driver Behaviour", "Road Quality", "Passenger load",
      "Speed", "Topography", "climate", "total"
    ),
    Values = c(0.8, 0.2, 0.2, 0.2, 0.24, 0.5, 0.8, 2.82)
  ),
  row.names = c(NA, -8L),
  class = "data.frame"
)

library(ggplot2)
library(dplyr)

# Parameters whose label is drawn in black above the bar instead of in white
# inside it.
outer_labels <- c("Driving", "total")

# Prepare the dataset: every bar is a rectangle one unit wide that starts at
# the running sum reached by the previous bar.
d1 <- d %>%
  mutate(
    ymax = cumsum(Values),
    ymin = lag(ymax, default = 0),
    xmax = as.numeric(factor(Parameter, levels = Parameter)),
    xmin = lag(xmax, default = 0),
    # centre of each rectangle, used for the label and the axis tick
    x = (xmin + xmax) / 2,
    y = (ymin + ymax) / 2
  ) %>%
  # Get the total right: the "total" bar starts at zero and reaches its own
  # value rather than continuing the running sum.
  mutate(
    ymin = ifelse(Parameter == "total", 0, ymin),
    ymax = ifelse(Parameter == "total", Values, ymax),
    # outer labels sit 0.2 above the value instead of in the bar centre
    y = ifelse(Parameter %in% outer_labels, Values + .2, y),
    label = case_when(
      Parameter == "Driving" ~ paste0("Best Case\n", Values),
      Parameter == "total" ~ paste0("Worst Case\n", Values),
      TRUE ~ as.character(Values)
    ),
    label_color = ifelse(Parameter %in% outer_labels, "black", "white")
  )

ggplot(d1, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax)) +
  geom_rect(fill = "#E40019") +
  geom_text(
    aes(x = x, y = y, label = label, color = label_color),
    show.legend = FALSE
  ) +
  # One tick per bar, placed at the bar centre computed above, so the axis
  # stays correct when rows are added or removed.
  scale_x_continuous(breaks = d1$x, labels = d1$Parameter) +
  scale_color_manual(values = c(white = "white", black = "black")) +
  theme_bw()
Created on 2020-06-16 by the reprex package (v0.3.0)

Related

How fill geom_ribbon with different colour in R?

I am trying to use different fill for geom_ribbon according to the x-values (For Temp = 0-20 one fill, 20-30.1 another fill and > 30.1 another fill). I am using the following code
library(tidyverse)

# Upper and lower envelope of the two curves, plus the temperature bin of each
# row. Bin edges: 0, 20, 30.1 and five units beyond the largest Temp.
bounds2 <- mutate(
  df,
  ymax = pmax(Growth.rate, slope),
  ymin = pmin(Growth.rate, slope),
  x_bins = cut(Temp, breaks = c(0, 20, 30.1, max(Temp) + 5))
)

# Growth rate (blue) and slope (red) on one panel; the secondary axis applies
# the identity transformation `.^1` and is only relabelled "slope". The area
# between the curves is filled per temperature bin.
ggplot(df, aes(x = Temp, y = Growth.rate)) +
  geom_line(colour = "blue") +
  geom_line(aes(y = slope), colour = "red") +
  scale_y_continuous(sec.axis = sec_axis(~ .^1, name = "slope")) +
  geom_ribbon(
    data = bounds2,
    mapping = aes(x = Temp, ymin = ymin, ymax = ymax, fill = x_bins),
    alpha = 0.4
  )
It returns the following output:
As you can see from the output, some regions remain empty. How can I fill those parts of the curve?
Here is the data
# Example data: temperature with the corresponding growth rate and slope.
df <- structure(
  list(
    Temp = c(10, 13, 17, 20, 25, 28, 30, 32, 35, 38),
    Growth.rate = c(0, 0.02, 0.19, 0.39, 0.79, 0.96, 1, 0.95, 0.65, 0),
    slope = c(0, 0.02, 0.16, 0.2, 0.39, 0.1, 0.03, -0.04, -0.29, -0.65)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
Here's a solution that involves interpolating new points at the boundaries between the areas. I used approx to get the values of ymin and ymax at Temp=30.1 and added this to the plotting dataset.
Then, instead of using cut just once as you did I use it twice, once with lower bounds included in each set then once with upper bounds included. Then I reshape the data long, and de-duplicate the rows I don't need.
If you zoom in enough you can see that the boundary is at 30.1 not at 30.
# Upper and lower envelope of the two curves.
bounds2 <- mutate(
  df,
  ymax = pmax(Growth.rate, slope),
  ymin = pmin(Growth.rate, slope)
)

# Bin edges and labels shared by both cut() calls below. The last edge lies
# five units beyond the largest Temp once the 30.1 row has been added.
bin_breaks <- c(0, 20, 30.1, max(bounds2$Temp, 30.1) + 5)
bin_labels <- c("0-20", "20-30.1", "30.1-max")

# Envelope values interpolated at the 30.1 boundary.
ymax_at_boundary <- approx(bounds2$Temp, bounds2$ymax, xout = 30.1)$y
ymin_at_boundary <- approx(bounds2$Temp, bounds2$ymin, xout = 30.1)$y

bounds2 <- bounds2 |>
  add_case(Temp = 30.1, ymax = ymax_at_boundary, ymin = ymin_at_boundary) |>
  # Bin every row twice: once with the lower edge included (right = FALSE) and
  # once with the upper edge included, so a point lying exactly on an edge is
  # assigned to both neighbouring bins.
  mutate(
    x_bins2 = cut(Temp, breaks = bin_breaks, right = FALSE, labels = bin_labels),
    x_bins = cut(Temp, breaks = bin_breaks, labels = bin_labels)
  ) |>
  # Stack both assignments into one column `xb` and drop the rows where the
  # two assignments were identical.
  tidyr::pivot_longer(
    cols = c(x_bins2, x_bins),
    names_to = NULL,
    values_to = "xb"
  ) |>
  distinct()

ggplot(df, aes(x = Temp, y = Growth.rate)) +
  geom_line(colour = "blue") +
  geom_line(aes(y = slope), colour = "red") +
  scale_y_continuous(sec.axis = sec_axis(~ .^1, name = "slope")) +
  geom_ribbon(
    data = bounds2,
    mapping = aes(x = Temp, ymin = ymin, ymax = ymax, fill = xb),
    alpha = 0.4
  )
The idea is here, but the code I show could be much improved at the step that duplicates the last row of each x_bins category and moves the copy into the next category.
### Libraries
library(tidyverse)

### Data
df <- structure(
  list(
    Temp = c(10, 13, 17, 20, 25, 28, 30, 32, 35, 38),
    Growth.rate = c(0, 0.02, 0.19, 0.39, 0.79, 0.96, 1, 0.95, 0.65, 0),
    slope = c(0, 0.02, 0.16, 0.2, 0.39, 0.1, 0.03, -0.04, -0.29, -0.65)
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)

### Preprocessing
# Envelope of the two curves and the temperature bin of each row. Rows are
# sorted by Temp because the boundary detection below compares neighbours.
bounds2 <- df %>%
  arrange(Temp) %>%
  mutate(
    ymax = pmax(Growth.rate, slope),
    ymin = pmin(Growth.rate, slope),
    x_bins = cut(Temp, breaks = c(0, 20, 30.1, max(Temp) + 5))
  )

### Duplicate the last row of each bin and move the copy into the next bin
# A ribbon is drawn per bin, so without a shared point the area between the
# last point of one bin and the first point of the next stays empty. The
# boundary rows are detected from the data instead of hard-coded row numbers.
bin_codes <- as.integer(bounds2$x_bins)
last_in_bin <- which(diff(bin_codes) != 0)
shared_rows <- bounds2[last_in_bin, ]
shared_rows$x_bins <- bounds2$x_bins[last_in_bin + 1]
bounds2 <- rbind(bounds2, shared_rows)

### Plot
ggplot(df, aes(x = Temp, y = Growth.rate)) +
  geom_line(colour = "blue") +
  geom_line(aes(y = slope), colour = "red") +
  scale_y_continuous(sec.axis = sec_axis(~ .^1, name = "slope")) +
  geom_ribbon(
    data = bounds2,
    mapping = aes(x = Temp, ymin = ymin, ymax = ymax, fill = x_bins),
    alpha = 0.4
  )

Labeling stack bar chart with percentage

I am trying to label a stacked bar chart with percentages, but I ended up labeling it with proportions. Here is my code:
# Stacked bars rescaled to fill the full height per Grade, with a percent
# formatted y axis. The in-bar labels print `percent` with two decimals.
ggplot(GradeSTEM_data, aes(x = Grade, y = percent, fill = STEMFlag)) +
  geom_bar(stat = "identity", position = "fill") +
  geom_text(
    aes(label = sprintf("%.02f", percent)),
    position = position_stack(vjust = 0.5),
    size = 2
  ) +
  scale_y_continuous(labels = scales::label_percent(accuracy = 1))
Here is a potential solution:
# Load libraries
library(tidyverse)

# Create 'fake' data (minimal reproducible example): five grades, each with a
# STEM and a NONSTEM share.
stem_data <- data.frame(
  Grade = rep(c("A", "B", "C", "P", "NP"), times = 2),
  STEMflag = factor(
    rep(c("STEM", "NONSTEM"), each = 5),
    levels = c("STEM", "NONSTEM")
  ),
  percent = c(
    0.95, 0.93, 0.90, 0.67, 0.86,
    0.05, 0.07, 0.10, 0.33, 0.14
  )
)
head(stem_data)
#> Grade STEMflag percent
#> 1 A STEM 0.95
#> 2 B STEM 0.93
#> 3 C STEM 0.90
#> 4 P STEM 0.67
#> 5 NP STEM 0.86
#> 6 A NONSTEM 0.05

# Plot the example data: the label is the proportion multiplied by 100 with a
# "%" sign appended.
ggplot(stem_data, aes(x = Grade, y = percent, fill = STEMflag)) +
  geom_bar(stat = "identity", position = "fill") +
  geom_text(
    aes(label = paste0(percent * 100, "%")),
    position = position_stack(vjust = 0.5),
    size = 4
  ) +
  scale_y_continuous(labels = scales::label_percent(accuracy = 1))
jared's answer can be further improved:
scales::label_percent(accuracy = 1) can be used to format both kinds of labels, in geom_text() and in scale_y_continuous(), consistently and without repeating code
geom_bar(stat = "identity") can be abbreviated to geom_col()
library(ggplot2)

# One percent formatter shared by the y axis and the in-bar labels, so both
# are formatted the same way.
lp <- scales::label_percent(accuracy = 1)

ggplot(GradeSTEM_data, aes(x = Grade, y = percent, fill = STEMflag)) +
  geom_col(position = "fill") +
  geom_text(aes(label = lp(percent)), position = position_stack(vjust = 0.5)) +
  scale_y_continuous(labels = lp)
Data
# Example data: share of STEM and NONSTEM per grade. Grade is a factor so the
# grades keep the order A, B, C, P, NP.
GradeSTEM_data <- data.frame(
  Grade = factor(
    rep(c("A", "B", "C", "P", "NP"), times = 2),
    levels = c("A", "B", "C", "P", "NP")
  ),
  STEMflag = factor(
    rep(c("STEM", "NONSTEM"), each = 5),
    levels = c("STEM", "NONSTEM")
  ),
  percent = c(
    0.95, 0.93, 0.90, 0.67, 0.86,
    0.05, 0.07, 0.10, 0.33, 0.14
  )
)

Rolling mean function similar to pandas in zoo

I want to create a rolling mean of some geochemical data. Currently, I have data for every 1 mm from 0mm to 45.7 mm and I want to average every 10 mm to create 1 cm averages.
This is my code at the moment, but it still is not giving me 1 cm averages. Can someone tell me where I am going wrong? Thanks
library(ggplot2)
library(tidypaleo)
library(zoo)

# Read the measurements and drop rows 251 to 458.
wapPbTa <- read.csv("wappbta.csv", header = TRUE)
wapPbTa <- wapPbTa[-(251:458), ]

# Rolling mean of PbTa over `width` consecutive rows, using rollmean()'s
# default alignment; positions without a full window are filled with NA.
width <- 10
RM <- rollmean(wapPbTa$PbTa, k = width, fill = NA)

## Averaged data
# RM is not a column of wapPbTa, so aes() picks it up from the workspace.
ggplot(wapPbTa, aes(x = RM, y = Depth)) +
  geom_lineh(size = 1) +
  geom_point(size = 2) +
  scale_y_reverse() +
  labs(y = "Depth (cm)") +
  theme_classic()

## Unaveraged data
ggplot(wapPbTa, aes(x = PbTa, y = Depth)) +
  geom_lineh(size = 1) +
  geom_point(size = 2) +
  scale_y_reverse() +
  labs(y = "Depth (cm)") +
  theme_classic()
# dput() of the first 20 rows of the data: `Depth` in steps of 0.1 and the
# measured `PbTa` value at each depth.
structure(list(Depth = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9), PbTa = c(0.163857678,
0.161569533, 0.086305592, 0, 0.006086142, 0, 0, 0.044096031,
0.050739958, 0.088385995, 0.104100946, 0.133012821, 0, 0.127524872,
0.046368715, 0.02514558, 0.109383676, 0.081979695, 0.0766503,
0.064679583)), row.names = c(NA, 20L), class = "data.frame")
This type of problem generally has to do with reshaping the data. The data should be in long format, but it is in wide format. See this post on how to reshape the data from wide to long format.
library(ggplot2)
library(dplyr)
library(tidyr)
library(tidypaleo)
library(zoo)

# Store the rolling mean (rollmeanr(), window of `width` rows, NA where the
# window is incomplete) as a column next to the raw values.
width <- 10
wapPbTa <- mutate(wapPbTa, RM = rollmeanr(PbTa, k = width, fill = NA))

# Long format: one row per Depth and series; the series name ends up in `name`
# and its value in `value`.
wapPbTa_long <- pivot_longer(wapPbTa, cols = -Depth)

# Raw series in black and rolling mean in blue, depth increasing downwards.
ggplot(wapPbTa_long, aes(x = value, y = Depth, colour = name)) +
  geom_lineh(size = 1) +
  geom_point(size = 2) +
  scale_colour_manual(
    breaks = c("PbTa", "RM"),
    values = c("black", "blue")
  ) +
  scale_y_reverse() +
  theme_classic() +
  labs(y = "Depth (cm)")

How to align labels in cross bars

I am trying to align three text labels (mean, median and current value) outside the crossbars. I appreciate any help.
My Data
# dput() of the input data frame (presumably the `df3` used by the plotting
# code below): one row per `variable`. The plotting code draws `mn` and `mx`
# as the crossbar bounds and `md`, `avg` and `value` as markers and labels;
# `Criteria` is mapped to the fill colour.
structure(list(variable = structure(1:10, .Label = c("GrossNetEquity",
"GrossTotalEquityPerfAttr", "LongNetEquity", "LongTotalEquity",
"NetEquity", "NetEquityPerfAttr", "NetTotalEquity", "ShortNetEquity",
"ShortTotalEquity", "TotalNetEquity"), class = "factor"), mx = c(134.5,
8.1, 95.6, 106.4, 61, 6.8, 71.6, -21.4, -24.9, 148.7), mn = c(71.1,
-4.6, 49.7, 66.2, 27, -4.1, 36.4, -46.3, -47.4, 96), avg = c(112.173148148148,
1.14814814814815, 77.7388888888889, 84.5111111111111, 43.262037037037,
1.05092592592593, 48.0694444444444, -34.4194444444444, -36.4416666666667,
120.952777777778), sd = c(14.5968093202928, 2.39877232936504,
9.87368667081958, 8.7204382695887, 7.29159953981859, 2.24405738054356,
7.05196278547511, 6.04899711056417, 5.77265751334298, 13.0003483658092
), md = c(114.15, 1.4, 77.35, 82.65, 41.45, 1.25, 46.35, -34.1,
-35.55, 119.75), firstldiff = c(82.9795295075625, -3.64939651058193,
57.9915155472497, 67.0702345719337, 28.6788379573998, -3.4371888351612,
33.9655188734942, -46.5174386655728, -47.9869816933526, 94.9520810461593
), firstlsum = c(141.366766788734, 5.94569280687823, 97.4862622305281,
101.951987650289, 57.8452361166742, 5.53904068701305, 62.1733700153947,
-22.3214502233161, -24.8963516399807, 146.953474509396), secldiff = c(68.3827201872697,
-6.04816883994697, 48.1178288764302, 58.349796302345, 21.3872384175813,
-5.68124621570476, 26.9135560880191, -52.566435776137, -53.7596392066956,
81.9517326803501), seclsum = c(155.963576109027, 8.34446513624327,
107.359948901348, 110.672425919877, 65.1368356564928, 7.78309806755661,
69.2253328008698, -16.2724531127519, -19.1236941266377, 159.953822875205
), value = c(116.1, -1.2, 88, 92.3, 58.8, -1.2, 63, -28.1, -29.3,
121.6), Criteria = c(NA, NA, "", "", "orange", "", "orange",
"orange", "", "orange")), .Names = c("variable", "mx", "mn",
"avg", "sd", "md", "firstldiff", "firstlsum", "secldiff", "seclsum",
"value", "Criteria"), row.names = c(NA, -10L), class = "data.frame")
My Code
I am trying to show the Mean, Median and Current Value as bars on geom_crossbar, but I am finding it hard to align them.
# Horizontal crossbars from mn to mx per variable, with "|" markers and text
# labels for the median, the mean and the current value. Every layer inherits
# df3 and x = variable from the top-level call.
ggplot(df3, aes(variable, mn)) +
  geom_crossbar(
    aes(ymin = mn, ymax = mx, fill = Criteria),
    width = 0.5,
    alpha = 0.50,
    position = position_dodge()
  ) +
  # median
  geom_point(aes(y = md, group = 1), shape = "|", size = 10, color = "brown1") +
  geom_text(
    aes(y = md, label = paste("Median", md)),
    size = 3, vjust = 2, hjust = -1.0, color = "brown1",
    position = position_dodge(width = 0.9)
  ) +
  # mean (note: the label text is built from `mn`, while the layer is
  # positioned at `avg`)
  geom_point(aes(y = avg, group = 1), shape = "|", size = 10, color = "coral4") +
  geom_text(
    aes(y = avg, label = paste("Mean", mn)),
    size = 3, vjust = 2.5, hjust = -1.0, color = "coral4"
  ) +
  # current value
  geom_point(aes(y = value, group = 1), shape = "|", size = 10, color = "brown1") +
  geom_text(
    aes(y = value, label = paste("Current Value", value)),
    size = 2, vjust = 3, hjust = -1.0, color = "brown1"
  ) +
  coord_flip()
If you wish to align your geom_text layers, you can assign them the same y value. I've included an example below. I also removed some repetitive parts from your code, where the different layers can inherit the data / aesthetic mappings from the top ggplot() level.
# "|" marker at the position given by `mapping`.
tick_mark <- function(mapping, colour) {
  geom_point(mapping, shape = "|", size = 10, color = colour)
}

# Left-aligned text label, shifted 5 units along y (rightwards after the flip).
# y is inherited from the top-level mapping (mx), so all labels of one
# variable start at the same position; vjust separates them vertically.
side_label <- function(mapping, size, vjust, colour) {
  geom_text(
    mapping,
    size = size, vjust = vjust, nudge_y = 5, hjust = 0, color = colour
  )
}

ggplot(df3, aes(variable, mx)) +
  geom_crossbar(
    aes(ymin = mn, ymax = mx, fill = Criteria),
    width = 0.5, alpha = 0.50, position = position_dodge()
  ) +
  # vertical bars
  tick_mark(aes(y = md), "brown1") +
  tick_mark(aes(y = avg), "coral4") +
  tick_mark(aes(y = value), "brown1") +
  # labels; the "Mean" label shows avg rather than the mn used in the original
  # code, because the matching vertical bar is drawn at avg and mn appears to
  # be the minimum, not the mean
  side_label(aes(label = paste("Median", md)), size = 3, vjust = -1, colour = "brown1") +
  side_label(aes(label = paste("Mean", avg)), size = 3, vjust = 0, colour = "coral4") +
  side_label(aes(label = paste("Current Value", value)), size = 2, vjust = 1, colour = "brown1") +
  coord_flip() +
  expand_limits(y = 200) # expand rightwards to give more space for labels
Note: The above follows the approach in your code, which repeats the same geom layers for different columns in the wide format data. In general, ggplot prefers to deal with data in long format. It looks cleaner, and would be easier to maintain as you only need to make changes (e.g. increase font size, change number of decimal places in the label) once, rather than repeat the change for every affected layer. A long format approach to this problem could look like this:
# Create a long format data frame for the markers and labels: one row per
# variable and statistic (md, avg, value). `mx` is kept on every row because
# the labels are positioned relative to it.
df3.labels <- df3 %>%
  select(variable, mx, md, avg, value) %>%
  # pivot_longer() replaces the superseded gather()
  tidyr::pivot_longer(
    cols = c(md, avg, value),
    names_to = "type",
    values_to = "value"
  ) %>%
  mutate(
    # label text: statistic name followed by the value rounded to 2 decimals
    label = paste0(
      case_when(
        type == "md" ~ "Median",
        type == "avg" ~ "Mean",
        TRUE ~ "Current Value"
      ),
      ": ",
      round(value, 2)
    ),
    # vertical justification that separates the three labels of one variable
    vjust = case_when(
      type == "md" ~ -1,
      type == "avg" ~ 0,
      TRUE ~ 1
    )
  )

# Place df3.labels in the top level call, since there are two geom layers that
# use it as the data source, and only one that uses df3.
ggplot(
  df3.labels,
  aes(x = variable, y = value, color = type, label = label)
) +
  geom_crossbar(
    data = df3,
    aes(x = variable, y = mn, ymin = mn, ymax = mx, fill = Criteria),
    inherit.aes = FALSE,
    width = 0.5, alpha = 0.50
  ) +
  geom_point(shape = "|", size = 10) +
  geom_text(aes(y = mx, vjust = vjust), size = 3, nudge_y = 5, hjust = 0) +
  # Change colour mappings here; guide = "none" hides the colour legend
  # (guide = FALSE is deprecated).
  scale_color_manual(
    values = c("md" = "brown1", "avg" = "coral4", "value" = "brown1"),
    guide = "none"
  ) +
  coord_flip() +
  expand_limits(y = 200)

Reporting number of NAs in Bar Plot using ggplot and r

Goal
I am attempting to create a series of bar plots that display satisfaction with a recent election in 6 countries. The data include NAs, and I want to report the number of NAs for each country in its respective bar plot.
Is there a way to have ggplot / R automatically insert the number of NA values in the bar plots? I cannot seem to figure this out on my own. I have put in place-holder text in each graph: "NA = ##".
Thank you!
Data Structure
The data look like this:
Country Satisfaction
------- ------------
Algeria 1
Algeria 3
Algeria NA
...
Burundi 3
Burundi 2
Burundi 2
...
Cameroon 4
Cameroon NA
Cameroon NA
...
Example Code
# Simulated survey: 30 answers for each of six countries.
countries <- c(
  "Algeria", "Benin", "Botswana", "Burkina Faso", "Burundi", "Cameroon"
)
countryvar <- rep(countries, each = 30)

# Answers 1 to 5 drawn with the given probabilities; answer 5 is then recoded
# as missing.
satisfaction <- sample(1:5, 180, replace = TRUE, prob = c(0.2, 0.3, 0.35, 0.1, 0.05))
satisfaction <- replace(satisfaction, satisfaction == 5, NA)

# cbind() of a character and an integer vector gives a character matrix, so
# both columns of df are character.
df <- as.data.frame(cbind(countryvar, satisfaction))

# One bar chart per country of the non-missing answers, with a placeholder
# "NA = ##" box drawn in every panel.
p <- ggplot2::ggplot(data = df[!is.na(df$satisfaction), ], aes(satisfaction))
p +
  geom_bar(aes(fill = satisfaction)) +
  facet_wrap(~ countryvar, nrow = 2) +
  annotate(
    "rect",
    xmin = 3.5, xmax = 4.5, ymin = 22.5, ymax = 25,
    fill = "white", colour = "black"
  ) +
  annotate("text", x = 3.6, y = 23.1, hjust = 0, vjust = 0, label = "NA = ##") +
  scale_fill_manual(values = c("snow3", "seashell3", "snow4", "black")) +
  labs(x = "Satisfaction with Recent Election", y = "Counts") +
  theme_bw()
Here is an approach using geom_text.
Basically count the NA per country
# Keep only the rows with a missing answer and count them per country; the
# count is returned in column `n`.
df %>%
  subset(is.na(satisfaction)) %>%
  count(countryvar)
and use that as data for geom_text
library(ggplot2)
library(tidyverse)

# Number of missing answers per country (column `n`), used as the data of the
# text layer so that each facet shows its own count.
na_counts <- count(subset(df, is.na(satisfaction)), countryvar)

p <- ggplot2::ggplot(data = subset(df, !is.na(satisfaction)), aes(satisfaction))
p +
  geom_bar(aes(fill = satisfaction)) +
  facet_wrap(~ countryvar, nrow = 2) +
  # white box behind the NA count
  annotate(
    "rect",
    xmin = 3.5, xmax = 4.5, ymin = 22.5, ymax = 25,
    fill = "white", colour = "black"
  ) +
  # x and y are fixed values, so the label sits at the same spot in every panel
  geom_text(
    data = na_counts,
    mapping = aes(label = paste("NA = ", n)),
    x = 3.6, y = 23.1, hjust = 0, vjust = 0
  ) +
  scale_fill_manual(values = c("snow3", "seashell3", "snow4", "black")) +
  labs(x = "Satisfaction with Recent Election", y = "Counts") +
  theme_bw()

Resources