My data is structured as follows:
> Comparison
# A tibble: 12 x 3
round TotalShots Year
<int> <dbl> <dbl>
1 1 70 2021
2 2 68 2021
3 3 76 2021
4 4 73 2021
5 5 66 2021
6 6 70 2021
7 1 115 2020
8 2 106 2020
9 3 75 2020
10 4 73 2020
11 5 82 2020
12 6 84 2020
I can plot this in ggplot2 via:
# Shots per round, one line per year, with every point labelled by its value.
# check_overlap = TRUE tells geom_text() to skip a label that would overlap
# one already drawn in the same layer.
ggplot(
  data = Comparison,
  mapping = aes(
    x = round,
    y = TotalShots,
    colour = factor(Year),
    label = TotalShots
  )
) +
  geom_line() +
  geom_point(size = 14) +
  geom_text(size = 5, colour = "black", check_overlap = TRUE)
However, in the plot the label at round 3 is printed as 76 and not 75. I assume this is caused by check_overlap = TRUE, but the resulting plot is wrong: the point for Year = 2020 at round = 3 should be labelled 75, not 76.
Is there any way to fix this?
You can try using ggrepel library for clarity of labels and to avoid overlapping.
library(ggrepel)
library(ggplot2)
# Same chart, but the value labels are drawn with ggrepel, which moves
# overlapping labels apart instead of leaving some of them out.
ggplot(
  data = Comparison,
  mapping = aes(
    x = round,
    y = TotalShots,
    colour = factor(Year),
    label = TotalShots
  )
) +
  geom_line() +
  geom_point(size = 14) +
  geom_label_repel(size = 5, colour = "black", nudge_y = 0.8)
Related
This question already has answers here:
Plotting two variables as lines using ggplot2 on the same graph
(5 answers)
Closed 7 months ago.
I would like to create a line graph that shows the trend of four air pollutants over the years 2009 to 2019.
Year  CO2  NO2  O3  PM2.5
2009   30   18  20     30
2010   32   16  22     20
2011   33   16  24     20
2012   32   15  25     22
2013   34   14  27     24
2014   36   14  28     22
2015   38   13  29     20
2016   39   13  30     18
2017   40   12  32     16
2018   44   13  34     15
2019   45   11  38     14
I tried the code below, but it produces a bar chart; I would like a line graph where all four pollutants are in the same plot.
# Column chart of n filled by pollutant, drawn in a separate facet per year.
ggplot(data = df, mapping = aes(x = Year, y = n, fill = airpollutants)) +
  geom_col() +
  facet_wrap(~Year) +
  ggtitle("trend of airpollutants")
I want this output:
https://cdn.ablebits.com/_img-blog/line-graph/line-graph-excel.png
You could reshape your data from wide to long format and map each air pollutant to its own colour, like this:
# Wide-format example data: one row per year, one column per pollutant.
# The first line of the text block supplies the column names.
df <- read.table(
  header = TRUE,
  text = "Year CO2 NO2 O3 PM2.5
2009 30 18 20 30
2010 32 16 22 20
2011 33 16 24 20
2012 32 15 25 22
2013 34 14 27 24
2014 36 14 28 22
2015 38 13 29 20
2016 39 13 30 18
2017 40 12 32 16
2018 44 13 34 15
2019 45 11 38 14
"
)
library(ggplot2)
library(dplyr)
library(reshape)
# Reshape to long format (one row per year and pollutant), then draw one
# coloured line per pollutant.
df %>%
  melt(id.vars = "Year") %>%
  mutate(variable = as.factor(variable)) %>%
  ggplot(mapping = aes(x = Year, y = value, colour = variable)) +
  geom_line() +
  labs(colour = "airpollutants", title = "trend of airpollutants")
Created on 2022-07-26 by the reprex package (v2.0.1)
Usually you'll want to be in long format when plotting in ggplot2.
One way to draw multiple lines without going long is to map over the columns
# Draw one line per pollutant without reshaping to long format: map2() builds
# a geom_line() layer for every column except the first (Year), using the
# column's values for y and the column's name for the colour legend.
ggplot(data = df) +
  purrr::map2(
    df[-1], names(df[-1]),
    # `Year` is looked up in the plot data; `values` and `pollutant` are the
    # lambda's own arguments.
    \(values, pollutant) geom_line(aes(x = Year, y = values, colour = pollutant))
  ) +
  # Year is on the x-axis and the measured values are on the y-axis.
  labs(
    x = "Year",
    y = "Concentration",
    colour = "Pollutant"
  )
set.seed(123)
library(ggplot2)
library(tidyr)
# Example data
# Wide-format example data: each pollutant column is a random permutation of
# an 11-value integer range, one value per year.
df <- data.frame(
  year  = 2009:2019,
  CO2   = sample(30:40, size = 11),
  NO2   = sample(10:20, size = 11),
  O3    = sample(20:30, size = 11),
  PM2.5 = sample(15:25, size = 11)
)
# Convert to long format
# Every column except `year` is a pollutant: stack them into a
# Pollutant / Concentration pair of columns.
df_long <- pivot_longer(
  df,
  cols = !year,
  names_to = "Pollutant",
  values_to = "Concentration"
)
# Plot
# One line per pollutant, distinguished by both colour and line type, with an
# x-axis tick for every year from 2009 to 2019.
ggplot(
  data = df_long,
  mapping = aes(
    x = year,
    y = Concentration,
    colour = Pollutant,
    linetype = Pollutant
  )
) +
  geom_line(size = 0.7) +
  labs(title = "Trend of Airpollutants", x = "Year", y = "Concentration") +
  scale_x_continuous(breaks = 2009:2019, limits = c(2009, 2019)) +
  theme_minimal()
I have my data here in this googledoc
That looks like this:
# A tibble: 57 × 3
date n_sym n_rep
<date> <dbl> <dbl>
1 2020-06-01 153 63
2 2020-06-02 206 168
3 2020-06-03 192 202
4 2020-06-04 168 247
5 2020-06-05 155 211
6 2020-06-06 150 155
7 2020-06-07 100 85
8 2020-06-08 192 125
9 2020-06-09 182 195
10 2020-06-10 198 234
# … with 47 more rows
I would like to create a stacked histogram with daily bins, like something in this figure.
Where: n_sym and n_rep are counts stacked one over each other.
I can't understand how to proceed....
Here is one way to do it; you can adapt it to get your desired plot:
library(tidyverse)
library(scales)
# Long format: one row per date and series. The two count columns are stacked
# into `name` / `value`, and `name` is recoded to the legend labels.
df1 <- df %>%
  pivot_longer(cols = !date) %>%
  mutate(
    date = as.Date(date),
    name = ifelse(name == "n_sym", "Onset of symptoms", "Date of reporting")
  )
# Height of the tallest stacked bar. The y-axis limits must be based on the
# per-day total of both series, not on the largest single value: bars that
# extend beyond the scale limits are censored and dropped from the plot.
max_daily_total <- max(tapply(df1$value, df1$date, sum, na.rm = TRUE))

# Stacked daily columns, one fill colour per series.
ggplot(df1, aes(x = date, y = value, fill = name)) +
  geom_col() +
  xlab("Onset of symptoms, alternatively date of reporting (2020)") +
  ylab("Number of reported cases") +
  scale_fill_manual(values = c("#ffc000", "#045aa0")) +
  # One tick per day, labelled as day/month.
  scale_x_date(date_breaks = "1 day", labels = date_format("%d/%m")) +
  # Bars start exactly at zero; a tick every 100 cases up to the tallest bar.
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, max_daily_total),
    breaks = seq(0, max_daily_total, 100)
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  theme(legend.position = "bottom") +
  guides(fill = guide_legend(title = "")) +
  coord_fixed(ratio = .05) +
  theme(axis.title = element_text(size = 16))
Can someone provide me some hints as to what I am doing wrong in my code? Or what I need to correct to get the correct percentages? I am trying to get the proportions by manipulating my ggplot2 code. I would prefer not mutating a column. However, if I can't get ggplot2 to give me the correct proportions, I will then be open to adding columns.
Here is the reproducible data:
# Example data: one observation per element, giving the cat type code and the
# country it was recorded in (29 observations).
cat_type <- c(
  "1", "1", "2", "3", "1", "3", "3", "2", "1", "1",
  "1", "3", "3", "2", "3", "2", "3", "1", "3", "3",
  "3", "1", "3", "1", "3", "1", "1", "3", "1"
)
country <- c(
  "India", "India", "India", "India", "India",
  "India", "India", "India", "India", "India",
  "Indonesia", "Russia", "Indonesia", "Russia", "Russia",
  "Indonesia", "Indonesia", "Indonesia", "Indonesia", "Russia",
  "Indonesia", "Russia", "Indonesia", "Indonesia", "Russia",
  "Russia", "India", "India", "India"
)
bigcats <- data.frame(cat_type = cat_type, country = country)
My data gives me the following proportions (these are correct):
> table(bigcats$cat_type, bigcats$country) ## raw numbers
India Indonesia Russia
1 7 3 2
2 2 1 1
3 4 5 4
>
> 100*round(prop.table(table(bigcats$cat_type, bigcats$country),2),3) ## proportions by column total
India Indonesia Russia
1 53.8 33.3 28.6
2 15.4 11.1 14.3
3 30.8 55.6 57.1
However, my ggplot2 is giving me the incorrect proportions:
# Filled (100%) bar chart of cat types per country. The y value and the text
# labels are both computed as prop.table(stat(count)).
ggplot(
  data = bigcats,
  mapping = aes(
    x = country,
    y = prop.table(stat(count)),
    fill = cat_type,
    label = scales::percent(prop.table(stat(count)))
  )
) +
  geom_bar(position = position_fill()) +
  geom_text(
    stat = "count",
    position = position_fill(vjust = 0.5),
    colour = "white",
    size = 5
  ) +
  labs(y = "Percent", title = "Top Big Cat Populations", x = "Country") +
  scale_fill_discrete(
    name = NULL,
    labels = c("Siberian/Bengal", "Other wild cats", "Puma/Leopard/Jaguar")
  ) +
  scale_y_continuous(labels = scales::percent)
The issue is that prop.table(stat(count)) does not compute the proportions within each country; it divides every count by the grand total, i.e. it is equivalent to:
library(dplyr)
# Count every cat_type / country combination and express each count as a
# share of the total over all combinations.
mutate(
  count(bigcats, cat_type, country),
  pct = scales::percent(prop.table(n))
)
#> cat_type country n pct
#> 1 1 India 7 24.1%
#> 2 1 Indonesia 3 10.3%
#> 3 1 Russia 2 6.9%
#> 4 2 India 2 6.9%
#> 5 2 Indonesia 1 3.4%
#> 6 2 Russia 1 3.4%
#> 7 3 India 4 13.8%
#> 8 3 Indonesia 5 17.2%
#> 9 3 Russia 4 13.8%
Making use of a helper function to reduce code duplication you could compute your desired proportions like so:
library(ggplot2)
#' Share of each count within its group
#'
#' @param count Numeric vector of counts.
#' @param group Vector of the same length as `count` giving the group each
#'   count belongs to (for example the bar position a count is stacked in).
#' @return A numeric vector of the same length as `count`: every count divided
#'   by the sum of all counts that share its group.
prop <- function(count, group) {
  # ave() returns the group total aligned with every element, so this works
  # for any kind of group label. Looking the totals up with `[group]` treats
  # numeric labels as positions, which breaks unless they are exactly 1..k.
  count / ave(count, group, FUN = sum)
}
# Filled (100%) bar chart of cat types per country. prop() is applied to the
# computed counts, grouped by the bar's x position, for both the bar height
# and the percentage label.
ggplot(
  data = bigcats,
  mapping = aes(
    x = country,
    y = prop(after_stat(count), after_stat(x)),
    fill = cat_type,
    label = scales::percent(prop(after_stat(count), after_stat(x)))
  )
) +
  geom_bar(position = position_fill()) +
  geom_text(
    stat = "count",
    position = position_fill(vjust = 0.5),
    colour = "white",
    size = 5
  ) +
  labs(y = "Percent", title = "Top Big Cat Populations", x = "Country") +
  scale_fill_discrete(
    name = NULL,
    labels = c("Siberian/Bengal", "Other wild cats", "Puma/Leopard/Jaguar")
  ) +
  scale_y_continuous(labels = scales::percent)
Created on 2021-07-28 by the reprex package (v2.0.0)
I have a pretty basic df in which I have calculated the rank-change of values between two timestamps:
value rank_A rank_B group
1 A 1 1 A
2 B 2 3 A
3 C 3 2 B
4 D 4 4 B
5 E 5 8 A
6 F 6 5 C
7 G 7 6 C
8 H 8 7 A
What makes it a bit tricky (for me) is plotting the values on the Y-axis.
# Alluvial plot of the rank change: `value` is mapped to y, `rank_A` and
# `rank_B` are the two axes, and the flows are filled by `group`.
ggplot(df_alluvial, aes(y = value, axis1 = rank_A, axis2 = rank_B))+
geom_alluvium(aes(fill = group), width = 1/12)+
# Remaining layers were elided in the original post.
...
As of now, I can plot the rank-change and the groups successfully, but they are not linked to my value-names - there are no axis names and I don't know how to add them.
In the end it should look similar to this:
https://www.reddit.com/r/GraphicalExcellence/comments/4imh5f/alluvial_diagram_population_size_and_rank_of_uk/
Thanks for your advice!
Your update made the question more clear to me.
The y parameter should be a numerical value, and the data should be in 'long' format. I'm not sure how to change your data to fulfill these requirements. Therefore, I create some new data in this example. I have tried to make the data similar to the data in the plot that you have linked to.
Labels and stratum refer to the city-names. You can use geom_text to label the strata.
# Load libraries
library(tidyverse)
library(ggalluvial)
# Create some data
# Example data in long format: one row per city and census year.
df_alluvial <- tibble(
  city = rep(c("London", "Birmingham", "Manchester"), times = 4),
  year = rep(c(1901, 1911, 1921, 1931), each = 3),
  size = c(
    0, 10, 100,
    10, 15, 100,
    15, 20, 100,
    30, 25, 100
  )
)

# Print the data to show its long-format layout
df_alluvial
#> # A tibble: 12 x 3
#> city year size
#> <chr> <dbl> <dbl>
#> 1 London 1901 0
#> 2 Birmingham 1901 10
#> 3 Manchester 1901 100
#> 4 London 1911 10
#> 5 Birmingham 1911 15
#> 6 Manchester 1911 100
#> 7 London 1921 15
#> 8 Birmingham 1921 20
#> 9 Manchester 1921 100
#> 10 London 1931 30
#> 11 Birmingham 1931 25
#> 12 Manchester 1931 100
# Alluvial plot with one stratum and one flow per city across the years;
# each stratum is labelled with its city name.
ggplot(
  data = df_alluvial,
  mapping = aes(
    x = as.factor(year),
    y = size,
    stratum = city,
    alluvium = city,
    fill = city,
    label = city
  )
) +
  geom_stratum(alpha = .5) +
  geom_alluvium() +
  geom_text(stat = "stratum", size = 3)
If you want to sort the cities based on their size, you can add decreasing = TRUE to all layers in the plot.
# Same alluvial plot, with decreasing = TRUE passed to every layer so that
# strata, flows and labels all use the same ordering.
ggplot(
  data = df_alluvial,
  mapping = aes(
    x = as.factor(year),
    y = size,
    stratum = city,
    alluvium = city,
    fill = city,
    label = city
  )
) +
  geom_stratum(alpha = .5, decreasing = TRUE) +
  geom_alluvium(decreasing = TRUE) +
  geom_text(stat = "stratum", size = 3, decreasing = TRUE)
Created on 2019-11-08 by the reprex package (v0.3.0)
I have a data set as below and I have created a graph with below code as suggested in a previous question. What I want to do is order the bars by rankings rather than team names. Is that possible to do in ggplot?
Team Names PLRankingsReverse Grreserve
Liverpool 20 20
Chelsea 19 19
Manchester City 15 18
Arsenal 16 17
Tottenham 18 16
Manchester United 8 15
Everton 10 14
Watford 13 13
Burnley 17 12
Southampton 9 11
WBA 11 10
Stoke 4 9
Bournemouth 12 8
Leicester 7 7
Middlesbrough 14 6
C. Palace 6 5
West Ham 1 4
Hull 3 3
Swansea 5 2
Sunderland 2 1
And here is the code:
# read.csv() turns the "Team Names" header into the syntactic name Team.Names.
alldata <- read.csv("premierleague.csv")
library(ggplot2)
library(reshape2)
# Long format: one row per team and ranking measure. With no id variables
# given, melt() uses the non-numeric column(s) as identifiers.
alldata <- melt(alldata)
# Dodged bars, one per ranking measure and team. The axis title has to be
# added with labs(): an `xlab` argument passed to ggplot() itself is ignored.
ggplot(alldata, aes(x = Team.Names, y = value, fill = variable)) +
  labs(x = "Team Names") +
  geom_bar(stat = "identity", width = 0.5, position = "dodge")
Thanks for the help!
In this case you need to capture the desired team order before melting the data frame, while there is still one row per team. You can then either convert the team column to a factor with those levels inside aes(), or pass the order as the limits of scale_x_discrete.
Using factor:
# Team names ordered from the highest to the lowest PLRankingsReverse. This
# must run on the un-melted data, while there is still one row per team.
# (Team.Names is the name read.csv() gives the "Team Names" column.)
ordr <- as.character(
  alldata$Team.Names[order(alldata$PLRankingsReverse, decreasing = TRUE)]
)
alldata <- melt(alldata)
# Converting the team column to a factor with these levels fixes the order in
# which the bars are drawn along the x-axis.
ggplot(
  alldata,
  aes(x = factor(Team.Names, levels = ordr), y = value, fill = variable)
) +
  labs(x = "Team Name") +
  geom_bar(stat = "identity", width = 0.5, position = "dodge")
Using scale_x_discrete:
# Team names ordered from the highest to the lowest PLRankingsReverse. This
# must run on the un-melted data, while there is still one row per team.
# (Team.Names is the name read.csv() gives the "Team Names" column.)
ordr <- as.character(
  alldata$Team.Names[order(alldata$PLRankingsReverse, decreasing = TRUE)]
)
alldata <- melt(alldata)
# The discrete x scale draws the teams in the order given by `limits`.
ggplot(alldata, aes(x = Team.Names, y = value, fill = variable)) +
  labs(x = "Team Name") +
  geom_bar(stat = "identity", width = 0.5, position = "dodge") +
  scale_x_discrete(limits = ordr)