I have made a data frame consisting of NFL stats. This data frame contains team name, how many yards they have allowed, if this was passing or rushing, and the total of the yards. I want to create a stacked bar chart with the x axis ordered in ascending order. I have used reorder() but for some reason the Panthers values are flipped and appear much higher than the rest (even though they have the lowest total). I have made other graphs this same way with no problems. I tried manually setting the y axis limit but that did not solve my problem. My question is why is this happening and what have I missed?
I am also open on ways to make this more efficient.
library(rvest)
library(data.table)
library(ggplot2)
#Read and organise data
def = read_html("https://www.pro-football-reference.com/years/2021/opp.htm")
defense = html_table(def)
defense = defense[[1]]
colnames(defense) = defense[1,]
defense = defense[c(-1,-34,-35,-36),]
names = c("rank", "team", "games", "points_against", "yards_conceded", "off_plays_faced",
"yards_per_off_play", "TO", "fumbles_recovered", "1stD_faced", "pass_cmp",
"pass_att", "pass_yd_conceded", "pass_td_conceded", "int_recovered",
"net_yd_pass_att", "pass_1stD_against", "rush_att", "rush_yards_conceded",
"rush_td_conceded", "rush_yards_per_att", "rush_1stD_conceded", "pen_against",
"pen_yards_conceded", "pen_1stD", "off_score_percent", "turnover_percent",
"expected_points_conceded")
colnames(defense) = names
# Get passing and rushing totals
passing_def = data.table(defense$team, defense$pass_yd_conceded, rep("pass", 32))
rushing_def = data.table(defense$team, defense$rush_yards_conceded, rep("rush", 32))
sums = rep(rowSums(cbind(as.numeric(passing_def$V2), as.numeric(rushing_def$V2))),2)
# join them and add the total yards
yds_conc = rbind(passing_def, rushing_def)
yds_conc = data.table(yds_conc, sums)
colnames(yds_conc) = c("team", "conceded", "type", "sums")
# Create the plot so that the teams are sorted from left to right by total yards given up
defplot = ggplot(data = yds_conc, aes(fill = type, x = reorder(team, sums),
y = conceded)) +
geom_bar(stat = 'identity') +
scale_fill_manual(name = "Play Type",labels=c("Passing","Rushing"),
values = c("#006aff", "#4fb350")) +
ggtitle('2021 Season Defense: Total Yards Conceded per Play Type') + ylab("Avg Rank") +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 60, vjust = 1, hjust=1),
axis.ticks.y = element_blank(), axis.text.y = element_blank())
defplot```
add this
yds_conc$conceded=as.numeric(yds_conc$conceded)
then run the same script will work
Related
I am trying to visualize time series data. I have set of 5 loggers which are indicating snow movement distance and an environmental variable that possibly has an effect on the snow movement. That is why it is meaningful to graph them together to see if snow movement (detected by the loggers) is influenced by these environmental factors. I have a file containing the weather station data and 5 files for each logger. The weather data is measured every 5 minutes while the loggers measure whenever there is movement! I have managed to visualize them together so far, however, my professor wants me to visualize the loggers as a group by presenting a gray area (instead of 5 lines) that always shows the minimum and maximum value of the loggers at a certain time point. I am using ggplot2. I've tried to make such area by using ribbon geom_ribbon but it is not so straight forward with my dataset. The line are crossing and often the loggers that have the min and max value switch. I don't know if joining them in a single dataset would help but this is also not possible because they don't have the same length. Furthermore, its not like all 5 loggers have measurements at the same time. They log only when there is movement. Here is my code and the graph that it creates. Unfortunately, I am not sure how to reproduce the data. I am more than glad to share it with you somehow.
#install.packages("patchwork")
library(ggplot2)
library(scales)
library(patchwork)
Sys.setlocale(category = "LC_ALL", locale = "english")
startTime <- as.Date("2017-10-01")
endTime <- as.Date("2018-06-30")
start_end <- c(startTime,endTime)
################################################## FALL LINE 1 #########################################################
logger1 <- read.csv("F1_17_18_167.csv",header=TRUE, sep=";")
logger1$date <- as.Date(logger1$Date, "%d.%m.%Y")
logger2 <- read.csv("F1_17_18_186.csv",header=TRUE, sep=";")
logger2$date <- as.Date(logger2$Date, "%d.%m.%Y")
logger3 <- read.csv("F1_17_18_031.csv",header=TRUE, sep=";")
logger3$date <- as.Date(logger3$Date, "%d.%m.%Y")
logger4 <- read.csv("F1_17_18_091.csv",header=TRUE, sep=";")
logger4$date <- as.Date(logger4$Date, "%d.%m.%Y")
logger5 <- read.csv("F1_17_18_294.csv",header=TRUE, sep=";")
logger5$date <- as.Date(logger5$Date, "%d.%m.%Y")
station <- read.csv("aggregates.csv",header=TRUE, sep=",")
station$date <- as.Date(station$Group.1, "%Y-%m-%d")
ggplot()+
geom_line(data = station, aes(x = date, y = Mean_snowheight ,color = "Mean Snowheight"),na.rm = TRUE, size = 1)+
scale_x_date(limits=start_end,breaks=date_breaks("1 month"),labels=date_format("%b %y"))+
scale_y_continuous(limits= c (0,115))
ggplot()+
geom_line(data = logger1, aes(x = date, y = AccuDist, color = "167 (mid-bottom)"),na.rm= TRUE, size = 1)+
geom_line(data = logger2, aes(x = date, y = AccuDist, color = "186 (top-middle)"),na.rm= TRUE, size = 1)+
geom_line(data = logger3, aes(x = date, y = AccuDist, color = "31 (top)"),na.rm= TRUE, size = 1)+
geom_line(data = logger4, aes(x = date, y = AccuDist, color = "91 (bottom)"),na.rm= TRUE, size = 1)+
geom_line(data = logger5, aes(x = date, y = AccuDist, color = "294 (middle)"),na.rm= TRUE, size = 1)+
geom_line(data = station, aes(x = date, y = Mean_snowheight*11.49 ,color = "Mean snowheight"),na.rm = TRUE, size = 1) +
ggtitle("Fall line 1") +
labs(color = "")+
xlab("Season 17/18")+
ylab("Accumulated Distance [mm]")+
scale_x_date(limits=start_end,breaks=date_breaks("1 month"),labels=date_format("%b %y"))+
scale_y_continuous(sec.axis = sec_axis(~./11.49,name = "Mean snowheight [cm]"),limits = c(0,1500))+
scale_color_manual("", guide = "legend",
values = c("167 (mid-bottom)"= "darkorange2",
"186 (top-middle)" = "darkgreen",
"31 (top)" = "red",
"91 (bottom)" = "blue",
"294 (middle)" = "purple",
"Mean snowheight" = "black"))+
theme(legend.position="bottom",
#legend.title = element_blank(),
axis.text.x = element_text(angle = 50, size = 10 , vjust = 0.5),
axis.text.y = element_text(size = 10, vjust = 0.5),
panel.background = element_rect(fill = "gray100"),
plot.background = element_rect(fill = "gray100"),
panel.grid.major = element_line(colour = "lightblue"),
plot.margin = unit(c(1, 1, 1, 1), "cm"),
plot.title = element_text(hjust = 0.5, size = 22))
You can see what graph this code produces:
If you ignore the environmental factor for a second (black line) you are left the accumulated snow movement distance over the winter period for each logger (the colored lines). My aim is to fill the area that is always between the lowest and highest line.
Let me know if I need to upload the data somewehere. This is how it the logger data looks like: data table.
Thanks in advance.
Regards,
Zorin
This is actually a tougher than it seems at first. Your goal as I understand is to fill in the area in your line plot between the "lowest" and the "highest" lines. This is made more difficult by the fact what is the lowest and highest line may change places throughout the plot, so you cannot simply choose to plot between one of the logs and another log. It's also made difficult by the fact that your x axis value is a date, so not all logs collect data on the same date and time.
First of all, I'll be ignoring a bit of your personal aesthetics you added and also removing the line you included for Mean snow height (from the dataframe station) for ease of showing you the solution I have.
Data Preparation
To begin, I noticed that you have included a geom_line() call for each individual logging station dataset (logger1 through logger5). While the method certainly works (and you do it in a way that gives you the solution you desire), it's much better practice to combine all logs into one dataset and this is going to be necessary in order for the solution I'm proposing to work anyway. Luckily, it's pretty simple to do this: just use rbind() to combine the datasets. Critically - you'll need to create a new column for each (called id here) that maintains the identity of the logging station of origin. You can then use that new id column as your color= aesthetic and draw all 5 lines using one geom_line() call.
One small problem I ran into is that your datasets had slightly different column names (some were caps, some were not...). They were all in the same order, so it wasn't too difficult to make them all the same before combining... it just added another step. Finally, I converted the date column to date format.
# create the id column
logger1$id <- 'logger1'
logger2$id <- 'logger2'
logger3$id <- 'logger3'
logger4$id <- 'logger4'
logger5$id <- 'logger5'
# fixing inconsistency in column names
my_column_names <- names(logger1)
names(logger2) <- my_column_names
names(logger3) <- my_column_names
names(logger4) <- my_column_names
names(logger5) <- my_column_names
# make one big df
loggers <- rbind(logger1, logger2, logger3, logger4, logger5)
loggers$date <- as.Date(loggers$date)
You can now recreate the plot in a more simple way:
ggplot(loggers, aes(x=date, y=AccuDist)) + theme_bw() +
geom_line(aes(color=id), size=1)
Finding the Running Minimum and Maximum
In order to create the fill, I'm using geom_ribbon(), which needs aesthetics ymin and ymax. You have to set those first though, and they need to be "running minimum" and the "running maximum", which means they will change as you progress through the data. For this, I'm using two functions shown below min_vect() and max_vect().
# find the "running maximum"
max_vect <- function(ac) {
curr_max <- 0
return_vector <- vector(mode = 'numeric', length=length(ac))
for(i in 1:length(ac)) {
if(ac[i] > curr_max) {
curr_max <- ac[i]
}
return_vector[i] <- curr_max
}
return(return_vector)
}
# find the "running minimum"
min_vect <- function(ac) {
curr_min <- max(ac)
return_vector <- vector(mode = 'numeric', length=length(ac))
for(i in length(ac):1) {
if(ac[i] < curr_min) {
curr_min <- ac[i]
}
return_vector[i] <- curr_min
}
return(return_vector)
}
The idea is that for the maximum, you step through an (ordered) vector and if the number is higher than the previous maximum number, it becomes the new maximum. The same strategy is used for the running minimum, albeit we have to step through the ordered vector in reverse.
In order to apply the functions to create new columns, the dataset needs to be ordered first in order for it to work properly:
# must arrange by date and time first!
loggers <- loggers %>% arrange(date, TIME)
# add your new columns
loggers$min_Accu <- min_vect(loggers$AccuDist)
loggers$max_Accu <- max_vect(loggers$AccuDist)
The Finale
And now, the plot. Basically it's the same, and I'm using geom_ribbon() as described above. For a bonus, I'm also using scale_color_discrete() to set the legend title and labels, just to show you that you can code that in afterwards (and it will still be easier than having separate geom_line() calls.
logger_list <- c('Log 1', 'Log 2', 'Log 3', 'Log 4', 'Log 5')
ggplot(loggers, aes(x=date, y=AccuDist)) +
theme_bw() +
geom_ribbon(aes(ymin=min_Accu, ymax=max_Accu), alpha=0.2) +
geom_line(aes(color=id), size=1) +
scale_color_discrete(name='Log ID Num', labels=logger_list)
I found how to estimate the historical Variance Decomposition for VAR models in R in the below link
Historical Variance Error Decompotision Daniel Ryback
Daniel Ryback presents the result in an excel plot, but I wanted to prepare it with ggplot so I created some lines to get it, nevertheless, the plot I got in ggplot is very different to the one showed by Daniel in Excel. I replicated in excel and got the same result than Daniel so it seems there is an error in the way I am preparing the ggplot. Does anyone have a suggestion to arrive to the excel result?
See below my code
library(vars)
library(ggplot2)
library(reshape2)
this code is run after runing the code developed by Daniel Ryback in the link above to define the HD function
data(Canada)
ab<-VAR(Canada, p = 2, type = "both")
HD <- VARhd(Estimation=ab)
HD[,,1]
ex <- HD[,,1]
ex1 <- as.data.frame(ex) # transforming the HD matrix as data frame #
ex2 <- ex1[3:84,1:4] # taking our the first 2 rows as they are N/As #
colnames(ex2) <- c("Emplyment", "Productivity", "Real Wages", "Unemplyment") # renaming columns #
ex2$Period <- 1:nrow(ex2) # creating an id column #
col_id <- grep("Period", names(ex2)) # setting the new variable as id #
ex3 <- ex2[, c(col_id, (1:ncol(ex2))[-col_id])] # moving id variable to the first column #
molten.ex <- melt(ex3, id = "Period") # melting the data frame #
ggplot(molten.ex, aes(x = Period, y = value, fill = variable)) +
geom_bar(stat = "identity") +
guides(fill = guide_legend(reverse = TRUE))
ggplot version
Excel version
The difference is that ggplot2 is ordering the variable factor and plotting it in a different order than excel. If you reorder the factor before plotting it will put 'unemployment' at the bottom and 'employment' at the top, as in excel:
molten.ex$variable <- factor(molten.ex$variable, levels = c("Unemployment",
"Real Wages",
"Productivity",
"Employment"))
ggplot(molten.ex, aes(x = Period, y = value, fill = variable)) +
geom_bar(stat = "identity", width = 0.6) +
guides(fill = guide_legend(reverse = TRUE)) +
# Making the R plot look more like excel for comparison...
scale_y_continuous(limits = c(-6,8), breaks = seq(-6,8, by = 2)) +
scale_fill_manual(name = NULL,
values = c(Unemployment = "#FFc000", # yellow
`Real Wages` = "#A4A4A4", # grey
Productivity = "#EC7C30", # orange
Employment = "#5E99CE")) + # blue
theme(rect = element_blank(),
panel.grid.major.y = element_line(colour = "#DADADA"),
legend.position = "bottom",
axis.ticks = element_blank(),
axis.title = element_blank(),
legend.key.size = unit(3, "mm"))
Giving:
To roughly match the excel graph in Daniel Ryback's post:
Edit: This question has been marked as duplicated, but the responses here have been tried and did not work because the case in question is a line chart, not a bar chart. Applying those methods produces a chart with 5 lines, 1 for each year - not useful. Did anyone who voted to mark as duplicate actually try those approaches on the sample dataset supplied with this question? If so please post as an answer.
Original Question:
There's a feature in Excel pivot charts which allows multilevel categorical axes.I'm trying to find a way to do the same thing with ggplot (or any other plotting package in R).
Consider the following dataset:
set.seed(1)
df=data.frame(year=rep(2009:2013,each=4),
quarter=rep(c("Q1","Q2","Q3","Q4"),5),
sales=40:59+rnorm(20,sd=5))
If this is imported to an Excel pivot table, it is straightforward to create the following chart:
Note how the x-axis has two levels, one for quarter and one for the grouping variable, year. Are multilevel axes possible with ggplot?
NB: There is a hack with facets that produces something similar, but this is not what I'm looking for.
library(ggplot2)
ggplot(df) +
geom_line(aes(x=quarter,y=sales,group=year))+
facet_grid(.~year,scales="free")
New labels are added using annotate(geom = "text",. Turn off clipping of x axis labels with clip = "off" in coord_cartesian.
Use theme to add extra margins (plot.margin) and remove (element_blank()) x axis text (axis.title.x, axis.text.x) and vertical grid lines (panel.grid.x).
library(ggplot2)
ggplot(data = df, aes(x = interaction(year, quarter, lex.order = TRUE),
y = sales, group = 1)) +
geom_line(colour = "blue") +
annotate(geom = "text", x = seq_len(nrow(df)), y = 34, label = df$quarter, size = 4) +
annotate(geom = "text", x = 2.5 + 4 * (0:4), y = 32, label = unique(df$year), size = 6) +
coord_cartesian(ylim = c(35, 65), expand = FALSE, clip = "off") +
theme_bw() +
theme(plot.margin = unit(c(1, 1, 4, 1), "lines"),
axis.title.x = element_blank(),
axis.text.x = element_blank(),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank())
See also the nice answer by #eipi10 here: Axis labels on two lines with nested x variables (year below months)
The suggested code by Henrik does work and helped me a lot! I think the solution has a high value. But please be aware, that there is a small misstake in the first line of the code, which results in a wrong order of the data.
Instead of
... aes(x = interaction(year,quarter), ...
it should be
... aes(x = interaction(quarter,year), ...
The resulting graphic has the data in the right order.
P.S. I suggested an edit (which was rejected until now) and, due to a small lack of reputation, I am not allowed to comment, what I rather would have done.
User Tung had a great answer on this thread
library(tidyverse)
library(lubridate)
library(scales)
set.seed(123)
df <- tibble(
date = as.Date(41000:42000, origin = "1899-12-30"),
value = c(rnorm(500, 5), rnorm(501, 10))
)
# create year column for facet
df <- df %>%
mutate(year = as.factor(year(date)))
p <- ggplot(df, aes(date, value)) +
geom_line() +
geom_vline(xintercept = as.numeric(df$date[yday(df$date) == 1]), color = "grey60") +
scale_x_date(date_labels = "%b",
breaks = pretty_breaks(),
expand = c(0, 0)) +
# switch the facet strip label to the bottom
facet_grid(.~ year, space = 'free_x', scales = 'free_x', switch = 'x') +
labs(x = "") +
theme_classic(base_size = 14, base_family = 'mono') +
theme(panel.grid.minor.x = element_blank()) +
# remove facet spacing on x-direction
theme(panel.spacing.x = unit(0,"line")) +
# switch the facet strip label to outside
# remove background color
theme(strip.placement = 'outside',
strip.background.x = element_blank())
p
I am trying to make a mosaic plot with ggplot2. I am using the bladdercancer data from the HSAUR3 package. I am looking to show the relationship between tumorsize and number, but I am not sure how to weight it. I know that the number in the sample with tumorsizes<=3cm is not the same as those with tumorsize>3cm. How do I incorporate that into my mosaic plot?
Here is what I did without weighting it.
library("ggplot2")
library("ggmosaic")
ggplot(data = bladdercancer, family=poisson()) +
geom_mosaic(aes(weight= 1 , x = product(tumorsize, number),
fill=factor(tumorsize)), na.rm=TRUE) +
labs(x="Number of tumors", title='Number of tumors vs Tumorsize') +
guides(fill=guide_legend(title = "Tumor Size"))
This may be late nevertheless I provide my suggestions since I am trying to do a similar thing. Below are two suggestions:
library(tidyverse)
first option:
bladdercancer %>%
group_by(tumorsize, number) %>%
# get frequencies/counts for each tumor size and for each number
summarise(n.cases = n()) %>%
ggplot() +
geom_mosaic(aes(weight = n.cases, x = product(number),
fill = factor(n.cases)), offset = 0) +
guides(fill=guide_legend(title = "Tumor Size")) +
labs(x="Number of tumors", title='Number of tumors vs Tumorsize') +
# remove background colour
theme_bw() +
theme(panel.grid.major = element_blank(),
# remove major and minor grids
panel.grid.minor = element_blank(),
# push title to the middle
plot.title = element_text(size = 10, hjust = .5))
where the categories within each column represent different counts for each tumor size e.g number 1 appears 15 times for <=3cm and 5 times for >3cm. I am however not able to partitions where the frequencies are the same, in this case number 3 and 4. Hence my option 2
second option:
ggplot(bladdercancer) +
geom_bar(aes(x = number, fill = tumorsize), position = "dodge")
The graph I'm currently trying to make falls a little between two stools. I want to make a histogram that is composed of stacked and labelled boxes. Here's an example of exactly the sort of thing I'm talking about, taken from a recent article in the New York Times:
http://farm8.staticflickr.com/7109/7026409819_1d2aaacd0a.jpg
Is it possible to achieve this using ggplot2?
To amplify the question somewhat, so far what I have is:
dfr <- data.frame(
name = LETTERS[1:26],
percent = rnorm(26, mean=15)
)
ggplot(dfr, aes(x=percent, fill=name)) + geom_bar() +
stat_bin(geom="text", aes(label=name))
...which I'm clearly doing all wrong. Ultimately what I'd ideally like is something along the lines of the manually-modified graph below, with (say) letters A to M filled one shade and N to Z filled another.
http://farm8.staticflickr.com/7116/7026536711_4df9a1aa12.jpg
Here you go!
set.seed(3421)
# added type to mimick which candidate is supported
dfr <- data.frame(
name = LETTERS[1:26],
percent = rnorm(26, mean=15),
type = sample(c("A", "B"), 26, replace = TRUE)
)
# easier to prepare data in advance. uses two ideas
# 1. calculate histogram bins (quite flexible)
# 2. calculate frequencies and label positions
dfr <- transform(dfr, perc_bin = cut(percent, 5))
dfr <- ddply(dfr, .(perc_bin), mutate,
freq = length(name), pos = cumsum(freq) - 0.5*freq)
# start plotting. key steps are
# 1. plot bars, filled by type and grouped by name
# 2. plot labels using name at position pos
# 3. get rid of grid, border, background, y axis text and lables
ggplot(dfr, aes(x = perc_bin)) +
geom_bar(aes(y = freq, group = name, fill = type), colour = 'gray',
show_guide = F) +
geom_text(aes(y = pos, label = name), colour = 'white') +
scale_fill_manual(values = c('red', 'orange')) +
theme_bw() + xlab("") + ylab("") +
opts(panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(),
axis.ticks = theme_blank(), panel.border = theme_blank(),
axis.text.y = theme_blank())