ggplot2 graphing and plotting average and minimum

ggplot2 graphing and plotting average and minimum - r

here is my code:
library(dplyr)
library(tidyr)
library(ggplot2) # required for the ggplot() call below

# Build one row per company and reference year (T-3 ... T+3). The T0 year
# keeps the price from T0data; every other year gets the highest price matched
# in Alldata for that company and year.
T0.modified <- T0data %>%
  # create year range based on each company's T0 year
  mutate(Year.M1 = Year - 1,
         Year.M2 = Year - 2,
         Year.M3 = Year - 3,
         Year.P1 = Year + 1,
         Year.P2 = Year + 2,
         Year.P3 = Year + 3) %>%
  # convert to long format, match with Alldata based on both company & year
  gather(reference.year, actual.year, -Company, -Price) %>%
  left_join(Alldata, by = c("Company" = "Company", "actual.year" = "Year")) %>%
  # keep T0 price for year T0, & use matched prices for all other years
  # (Price.x comes from T0data, Price.y from Alldata)
  mutate(Price = ifelse(reference.year == "Year", Price.x, Price.y)) %>%
  # take maximum of all matched prices for each company each year
  group_by(Company, reference.year) %>%
  summarise(Price = max(Price)) %>%
  ungroup() %>%
  # order reference.year for correct sequence in ggplot's x-axis
  mutate(reference.year = factor(reference.year,
                                 levels = c("Year.M3", "Year.M2", "Year.M1", "Year",
                                            "Year.P1", "Year.P2", "Year.P3"),
                                 labels = c("T-3", "T-2", "T-1", "T0", "T+1", "T+2", "T+3")))

# One line per company, plus a dashed grey line for the mean across companies.
ggplot(T0.modified,
       aes(x = reference.year, y = Price, group = Company, color = Company)) +
  geom_line() +
  xlab("Year") + theme_bw() +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = mean, geom = "line", group = 1,
               linetype = 2, size = 1.5, colour = "grey") +
  # x = 7 is the position of the seventh (last) factor level, "T+3"
  annotate("label", x = 7, y = 200, label = "Average",
           fill = "grey", alpha = 0.5, hjust = 1)
And here is my data:
T0data:
structure(list(Company = structure(1:3, .Label = c("Amazon",
"Cisco", "McDonald's"), class = "factor"), Year = c(2011L, 2008L,
2013L), Price = c(182, 21.82, 95.15)), .Names = c("Company",
"Year", "Price"), row.names = c(NA, 3L), class = "data.frame")
All Data:
structure(list(Company = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Amazon", "Cisco", "McDonald's"), class = "factor"),
Year = c(2008L, 2008L, 2008L, 2008L, 2009L, 2009L, 2010L,
2010L, 2010L, 2011L, 2011L, 2012L, 2012L, 2013L, 2013L, 2014L,
2014L, 2014L, 2008L, 2010L, 2010L, 2010L, 2011L, 2011L, 2012L,
2012L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L, 2016L,
2016L, 2016L, 2005L, 2005L, 2005L, 2006L, 2006L, 2007L, 2007L,
2007L, 2008L, 2008L, 2009L, 2009L, 2009L, 2010L, 2010L, 2011L,
2011L, 2011L), Price = c(91L, 77L, 81L, 87L, 63L, 88L, 110L,
75L, 117L, 170L, 190L, 215L, 245L, 316L, 275L, 330L, 378L,
390L, 55L, 62L, 66L, 65L, 72L, 98L, 93L, 88L, 99L, 101L,
94L, 103L, 96L, 99L, 116L, 112L, 123L, 113L, 19L, 17L, 18L,
20L, 19L, 26L, 31L, 27L, 24L, 21L, 14L, 22L, 18L, 26L, 22L,
14L, 16L, 15L)), .Names = c("Company", "Year", "Price"), class = "data.frame", row.names = c(NA,
-54L))
Here's my question:
How can I make the line graph show only 2 values, the average, and the minimum for all values?
And how can I also plot one randomly chosen company as a third line in the graph, so that I can compare it with the minimum and the average?

Something like this? It plots the average, the minimum and a random company (see subset).
# Dashed grey line: mean across companies per reference year.
# Dashed red line: minimum across companies per reference year.
# Solid blue line: a single company, chosen in subset(), for comparison.
p <- ggplot(T0.modified) +
  xlab("Year") +
  theme_bw() +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(aes(x = reference.year, y = Price), fun = mean, geom = "line",
               group = 1, linetype = 2, size = 1.5, colour = "grey") +
  stat_summary(aes(x = reference.year, y = Price), fun = min, geom = "line",
               group = 1, linetype = 2, size = 1.5, colour = "red") +
  annotate("label", x = 7, y = 200, label = "Average",
           fill = "grey", alpha = 0.5, hjust = 1) +
  annotate("label", x = 7, y = 30, label = "Min",
           fill = "grey", alpha = 0.5, hjust = 1) +
  # change the company name here to compare a different company
  geom_line(data = subset(T0.modified, Company == "Amazon"),
            aes(x = reference.year, y = Price, group = Company),
            color = "blue")

Related

Keep missing data in ggplot2 stacked barplot

This is my sample data. ID 144 contains 6 positions, while ID AB01 contains only 3. In a stacked plot I still want to show 6 positions for AB01, with the missing positions shown in a specific color.
ID YEAR POS
144 2017 10
144 2017 12
144 2017 18
144 2017 15
144 2017 163
144 2017 200
AB01 2018 10
AB01 2018 15
AB01 2018 18
This is what I tried.
# Stacked bar per ID with one unit-height segment per POS value, one facet
# per year. Uses the sample data frame `data` and its `YEAR` column.
ggplot(data, aes(x = ID, y = 1, fill = as.factor(POS))) +
  # NOTE(review): `exclude` does not appear to be a geom_bar() argument, so it
  # will not make the missing positions show up -- confirm against the docs.
  geom_bar(stat = "identity", position = "stack", exclude = NULL) +
  # facet on the column by name; `data1$Year` bypassed the plot data
  facet_wrap(~ YEAR, ncol = 1, scales = "free") +
  # the plot title argument of labs() is lower-case `title`
  labs(x = "Year", y = "Number ", fill = "Position", title = "Pos plot") +
  theme(text = element_text(size = 15, color = "Black"))
data
data <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("144", "AB01"), class = "factor"), YEAR = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L), POS = c(10L, 12L, 18L, 15L, 163L, 200L, 10L, 15L, 18L)), class = "data.frame", row.names = c(NA, -9L))

Can you use geom_tile instead?
data <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("144", "AB01"), class = "factor"), YEAR = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L), POS = c(10L, 12L, 18L, 15L, 163L, 200L, 10L, 15L, 18L)), class = "data.frame", row.names = c(NA, -9L))
# One tile per (ID, POS) row in `data`; a position that an ID does not have
# is simply not drawn.
ggplot(data, aes(x = ID, y = as.factor(POS), fill = as.factor(POS))) +
  geom_tile(color = "black") +
  coord_cartesian(expand = FALSE) + # get rid of space around tiles
  theme_classic() # make background white

# Same tile plot, split into one panel per year with the strip labels hidden.
ggplot(data, aes(x = ID, y = as.factor(POS), fill = as.factor(POS))) +
  geom_tile(color = "black") +
  # facet on the YEAR column of `data` rather than on `data1$Year`
  facet_wrap(~ YEAR, ncol = 2, scales = "free_x") +
  coord_cartesian(expand = FALSE) +
  theme(strip.background = element_blank(),
        strip.text.x = element_blank())

How about this:
data <- structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("144", "AB01"), class = "factor"), YEAR = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L), POS = c(10L, 12L, 18L, 15L, 163L, 200L, 10L, 15L, 18L)), class = "data.frame", row.names = c(NA, -9L))
library(ggplot2)
library(forcats)
library(tidyr)
library(dplyr)
# Expand to every ID x POS combination. Combinations that were not in `data`
# get temp = NA, which is then turned into an explicit "Missing" level.
data_1 <-
  data %>%
  mutate(temp = as.character(POS)) %>%
  complete(ID, POS) %>%
  mutate(temp = fct_explicit_na(fct_inseq(temp), na_level = "Missing"))

# Fill colour for each level of `temp`; missing positions are drawn in white.
col_map <- c("10" = "powderblue",
             "12" = "red",
             "18" = "orange",
             "15" = "yellow",
             "163" = "green",
             "200" = "blue",
             "Missing" = "White")

# One tile per ID/POS combination, so every ID shows the same number of
# tiles; fct_rev() reverses the order of the POS levels on the y-axis.
ggplot(data_1, aes(x = ID, y = fct_rev(factor(POS)), fill = temp)) +
  geom_tile(color = "black", width = 0.5, height = 0.8) +
  scale_fill_manual(values = col_map) +
  coord_cartesian(expand = FALSE) +
  labs(x = NULL,
       y = NULL,
       fill = NULL) +
  theme_classic() +
  theme(axis.ticks = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_text(size = 14),
        axis.line = element_blank())
Created on 2020-07-08 by the reprex package (v0.3.0)

How to remove zero frequency for frequency plot and fix time?

When I produce a frequency plot:
Data <- structure(list(Venue = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Conference", "Journal"), class = "factor"), Year = c(2008L,
2009L, 2010L, 2011L, 2012L, 2013L, 2014L, 2015L, 2016L, 2017L,
2018L, 2019L, 2008L, 2009L, 2010L, 2011L, 2012L, 2013L, 2014L,
2015L, 2016L, 2017L, 2018L), Frequency = c(0L, 0L, 0L, 0L, 1L,
1L, 2L, 1L, 4L, 4L, 11L, 3L, 2L, 1L, 0L, 0L, 3L, 5L, 3L, 7L,
8L, 19L, 10L)), class = "data.frame", row.names = c(NA, -23L))
library(ggplot2)
# Stacked bars of Frequency per Year, filled by Venue, with each segment's
# count printed at its vertical centre.
ggplot(Data, aes(Year, Frequency, fill = Venue, label = Frequency)) +
  geom_col() +
  geom_text(position = position_stack(vjust = 0.5), size = 3)
The plot shows labels for the zero values, and the years on the x axis do not match the years in the data frame.
How can I remove the zero-frequency labels from the plot (while keeping the non-zero values for the same year, e.g. 2012) and show every year on the x axis, one per bar?

Is this what you want?
The code to get it is:
# Year is converted to character so the x-axis is discrete and every bar gets
# its own tick label. Segments with a zero count get an NA label, so no text
# is drawn for them.
ggplot(Data, aes(x = as.character(Year), y = Frequency, fill = Venue,
                 label = ifelse(Frequency > 0, Frequency, NA))) +
  geom_bar(stat = "identity") +
  # na.rm = TRUE drops the NA labels without a "removed rows" warning
  geom_text(size = 3, position = position_stack(vjust = 0.5), na.rm = TRUE) +
  scale_x_discrete(name = "Year")

Defining T0 in my program

Here's a small program I'm making, to eventually get a final graph. I have 2 separate data sets. One is called T0 and the second one contains all the data I have. I want this program to get the T0 values from the first data frame, and then search for the maximum price in the 3 years before and the 3 years after the T0 year.
In essence, my program is going to assign T0 values that I chose arbitrarily. Then it will search automatically in my database for the maximum price in each year except the t0 year.
The problem I'm facing, is with the implementation of T0 values in the schedule. It just does not come out right when I run my code.
The problem apparently has to do with the way I'm defining T0. Should I use a for loop? or is there a small tweak I'm missing?
Final result wanted:
Data Base Example:
T0data:
structure(list(Company = structure(1:3, .Label = c("Amazon",
"Cisco", "McDonald's"), class = "factor"), Year = c(2011L, 2008L,
2013L), Price = c(182, 21.82, 95.15)), .Names = c("Company",
"Year", "Price"), row.names = c(NA, 3L), class = "data.frame")
All Data:
structure(list(Company = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Amazon", "Cisco", "McDonald's"), class = "factor"),
Year = c(2008L, 2008L, 2008L, 2008L, 2009L, 2009L, 2010L,
2010L, 2010L, 2011L, 2011L, 2012L, 2012L, 2013L, 2013L, 2014L,
2014L, 2014L, 2008L, 2010L, 2010L, 2010L, 2011L, 2011L, 2012L,
2012L, 2013L, 2013L, 2014L, 2014L, 2014L, 2015L, 2015L, 2016L,
2016L, 2016L, 2005L, 2005L, 2005L, 2006L, 2006L, 2007L, 2007L,
2007L, 2008L, 2008L, 2009L, 2009L, 2009L, 2010L, 2010L, 2011L,
2011L, 2011L), Price = c(91L, 77L, 81L, 87L, 63L, 88L, 110L,
75L, 117L, 170L, 190L, 215L, 245L, 316L, 275L, 330L, 378L,
390L, 55L, 62L, 66L, 65L, 72L, 98L, 93L, 88L, 99L, 101L,
94L, 103L, 96L, 99L, 116L, 112L, 123L, 113L, 19L, 17L, 18L,
20L, 19L, 26L, 31L, 27L, 24L, 21L, 14L, 22L, 18L, 26L, 22L,
14L, 16L, 15L)), .Names = c("Company", "Year", "Price"), class = "data.frame", row.names = c(NA,
-54L))
My code:
library(data.table)
# Load the per-company T0 rows and the full price history.
T0data <- read.csv(file = "C:/Users/My first file.csv", header = TRUE)
Alldata <- read.csv(file = "C:/Users/My second file.csv", header = TRUE)
d <- Alldata
setDT(d)
# NOTE(review): year_zero holds one year per row of T0data, so the comparisons
# below recycle it across the rows of d instead of pairing each company with
# its own T0 year -- confirm whether a per-company match is intended.
year_zero <- T0data$Year
# Filter to include year_zero +/- 3 years and get Best result per company per year
d <- d[Year >= year_zero - 3 & Year <= year_zero + 3,
       .(Best_Result = max(Price, na.rm = TRUE)), by = .(Company, Year)]
# Add T as interval to year_zero (and convert to factor in order to get all
# values from -3 to 3)
d[, "T" := factor(Year - year_zero, levels = seq(-3, 3), ordered = TRUE)]
# Cast to wide format (fill missing values with NA)
dcast(d, Company ~ T, value.var = "Best_Result", drop = FALSE)
# Cast to wide format (fill missing values with "")
dcast(d, Company ~ T, value.var = "Best_Result", drop = FALSE, fun.aggregate = paste0,
      fill = "")

Here's a solution that uses dplyr / tidyr packages from the tidyverse, rather than data.table, but it should do the job:
library(dplyr); library(tidyr)
# Result: one row per company and reference year, labelled T-3 ... T+3, with
# the T0data price at T0 and the highest matched Alldata price elsewhere.
T0.modified <- T0data %>%
  # three years either side of each company's T0 year
  mutate(
    Year.M1 = Year - 1,
    Year.M2 = Year - 2,
    Year.M3 = Year - 3,
    Year.P1 = Year + 1,
    Year.P2 = Year + 2,
    Year.P3 = Year + 3
  ) %>%
  # stack the seven year columns, then look up prices by company and year
  gather(key = reference.year, value = actual.year, -Company, -Price) %>%
  left_join(Alldata, by = c("Company", "actual.year" = "Year")) %>%
  # T0 rows keep the T0data price (Price.x); other rows use Alldata (Price.y)
  mutate(Price = ifelse(reference.year == "Year", Price.x, Price.y)) %>%
  # collapse to the highest price per company and reference year
  group_by(Company, reference.year) %>%
  summarise(Price = max(Price)) %>%
  ungroup() %>%
  # fix the level order so the x-axis runs T-3 ... T+3
  mutate(
    reference.year = factor(
      reference.year,
      levels = c("Year.M3", "Year.M2", "Year.M1", "Year",
                 "Year.P1", "Year.P2", "Year.P3"),
      labels = c("T-3", "T-2", "T-1", "T0", "T+1", "T+2", "T+3")
    )
  )
Resulting plot:
library(ggplot2)
# Price of every company across the reference years, one coloured line each.
ggplot(
  T0.modified,
  aes(reference.year, Price, group = Company, colour = Company)
) +
  geom_line() +
  theme_bw() +
  xlab("Year")
Edit adding average for each year using stat_summary:
# One line per company, plus a dashed grey line for the mean across companies.
ggplot(T0.modified,
       aes(x = reference.year, y = Price, group = Company, color = Company)) +
  geom_line() +
  xlab("Year") + theme_bw() +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary()
  stat_summary(fun = mean, geom = "line", group = 1,
               linetype = 2, size = 1.5, colour = "grey") +
  # x = 7 is the position of the seventh (last) factor level, "T+3"
  annotate("label", x = 7, y = 200, label = "Average",
           fill = "grey", alpha = 0.5, hjust = 1)

ggplot2 legends: Adding items not in plot code to the legend & changing shapes in legend

I have a complex plot that shows dots of different colors for patient grades in different years, and lines of different colors connecting repeated measures (the same patients) measured in the two years whose grading has changed. As you can see the legend simply lists the dots with lines for the two colors. What I need, however, is a legend that has a red dot for 2009 and a blue dot for 2016 (without lines!), as well as a red line that I can label "Upgraded" and a blue line I can label "Downgraded". So I need four legend items: 2 dots, 2 lines, 4 labels. I've done extensive searching on this, and cannot find the answer.
Here's the plot code I used to build all the aesthetics I wanted:
sampledata <- structure(list(patient = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L,
5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L,
12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L,
19L, 19L), grade = structure(c(5L, 5L, 5L, 5L, 5L, 1L, 5L, 1L,
1L, 5L, 1L, 5L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 5L, 2L, 5L, 3L, 3L,
3L, 3L, 2L, 4L, 2L, 4L, 4L, 1L, 4L, 1L, 4L, 2L, 4L, 2L), .Label = c("grade_I",
"grade_II", "grade_III", "indeterminate", "normal"), class = "factor"),
year = c(2009L, 2016L, 2009L, 2016L, 2009L, 2016L, 2009L,
2016L, 2009L, 2016L, 2009L, 2016L, 2009L, 2016L, 2009L, 2016L,
2009L, 2016L, 2009L, 2016L, 2009L, 2016L, 2009L, 2016L, 2009L,
2016L, 2009L, 2016L, 2009L, 2016L, 2009L, 2016L, 2009L, 2016L,
2009L, 2016L, 2009L, 2016L)), .Names = c("patient", "grade",
"year"), class = "data.frame", row.names = c(NA, -38L))
# Factor versions of the plotting variables, stored as columns of sampledata
# so that ggplot() finds them through `data`. The original referenced `year`,
# `gradef` and `guidegrades2`, none of which are defined in this snippet.
sampledata$yearf <- factor(sampledata$year)
sampledata$gradef <- factor(sampledata$grade,
                            levels = c("normal", "grade_I", "grade_II",
                                       "grade_III", "indeterminate"))

# Base plot: one point per patient and year, joined by a line per patient.
p <- ggplot(data = sampledata,
            aes(x = gradef, y = patient, group = patient, color = yearf)) +
  geom_point() + geom_line()

# Displays the base plot with palette and titles; the result is not stored.
p + scale_colour_brewer(palette = "Set1") +
  labs(x = "ASE Grade", y = "Patient", color = "ASE Guidelines")

# Readable tick labels for the grade levels.
pticks <- p +
  scale_x_discrete(labels = c("grade_I" = "Grade I", "grade_II" = "Grade II",
                              "grade_III" = "Grade III",
                              "indeterminate" = "Indeterminate",
                              "normal" = "Normal"))

# Strip grid lines and panel background, keep axis lines.
ptheme <- pticks +
  theme(panel.grid.minor = element_blank(),
        panel.grid.major = element_blank(),
        panel.background = element_blank(),
        legend.key = element_rect(fill = "white"),
        axis.line = element_line(),
        axis.title.x = element_text(vjust = 0.0001))

# Final plot object: themed plot plus palette and titles.
paxes <- ptheme + scale_colour_brewer(palette = "Set1") +
  labs(x = "ASE Grade", y = "Patient", color = "ASE Guidelines")

Error (ymax not found) while adding a geom_line (different data) to a multi-layered ggplot

I'm trying to generate a plot that summarizes a dataset by first plotting the median & quantiles in an area / black line, after which I want to outline a specific 'firm' with a red line.
I'd also like to do so while facetting on a variable, thus plotting multiple variables at once.
An example code of what I'd plot is as follows:
require(dplyr)
require(ggplot2)
dt <- structure(list(Firm = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("a", "b", "c", "d"), class = "factor"), Year = c(2008L,
2009L, 2008L, 2009L, 2008L, 2009L, 2008L, 2009L, 2008L, 2009L,
2008L, 2009L, 2008L, 2009L, 2008L, 2009L, 2008L, 2009L, 2008L,
2009L, 2008L, 2009L, 2008L, 2009L), variable = structure(c(1L,
1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 2L, 2L, 3L,
3L, 1L, 1L, 2L, 2L, 3L, 3L), .Label = c("var1", "var2", "var3"
), class = "factor"), value = c(0.991894223, 2.216322113, 3.189415462,
0.663732077, 0.444826423, 2.674568191, 1.272077011, 7.691464914,
4.263339855, 0.214415839, 3.995328653, 6.028747322, 8.191459456,
0.16205906, 4.056495056, 5.17994524, 0.42435417, 0.678655669,
6.246411921, 0.505532339, 4.65045746, 8.85141854, 5.850616048,
2.028583225)), .Names = c("Firm", "Year", "variable", "value"
), class = "data.frame", row.names = c(NA, -24L))
head(dt)
Firm Year variable value
1 a 2008 var1 0.9918942
2 a 2009 var1 2.2163221
3 a 2008 var2 3.1894155
4 a 2009 var2 0.6637321
5 a 2008 var3 0.4448264
6 a 2009 var3 2.6745682
I now manually calculate the ymin, ymax, and y for the ribbon / line plots. They're plotting just fine.
# Median of `value` per variable and year (y), with a band of 90% to 110% of
# that median (ymin, ymax) for the ribbon.
dt_aggregates <- dt %>%
  group_by(variable, Year) %>%
  arrange(variable, Year) %>%
  summarize(y = median(value)) %>%
  mutate(ymin = 0.9 * y,
         ymax = 1.1 * y)
This will be the firm I want to highlight:
# Rows of dt belonging to firm "a", the firm to highlight.
dt_focus <- dt %>% filter(Firm == "a")
The following plots just fine, and is almost what I want.
# Median line with a translucent ribbon between ymin and ymax, one facet row
# per variable, each with its own scales.
g <- ggplot(
  dt_aggregates,
  aes(x = Year, y = y, ymin = ymin, ymax = ymax, group = variable)
) +
  geom_line() +
  geom_ribbon(alpha = 0.3) +
  facet_grid(variable ~ ., scales = "free") +
  theme_bw()
However, I want to add another line (for the one firm) onto this (in red).
Once I try to add a new line with a separate dataframe, I get the following error. Any help on getting this to work is greatly appreciated
# Overlay the highlighted firm as a thick red line, drawn from dt_focus
# instead of the plot's own data.
# NOTE(review): the plot-level aes() of g maps ymax and ymin, while dt_focus
# is a subset of dt (columns Firm, Year, variable, value); the layer
# presumably inherits those mappings, which would explain the error below --
# confirm.
# Error in eval(expr, envir, enclos) : object 'ymax' not found
g + geom_line(data=dt_focus,
aes(x=Year, y=value, group=variable),
col="red", size=2)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

ggplot2 graphing and plotting average and minimum - r

Related

Keep missing data in ggplot2 stacked barplot

How to remove zero frequency for frequency plot and fix time?

Defining T0 in my program

ggplot2 legends: Adding items not in plot code to the legend & changing shapes in legend

Error (ymax not found) while adding a geom_line (different data) to a multi-layered ggplot

Categories

Resources