I want to display both an annual value (school hours) and its cumulative (total school hours) in the same graph. This works, so now I want to tidy-up the legend a little bit. I want to drop the three groups associated with the cumulative values and only report a custom label for each state I use. Thus, the legend should only read "NW, G8", "NW, G9", and "CA", next to the colour they are associated with.
I have found other solutions to this problem which no longer seem to work with the current ggplot version (at least I believe this to be a version-issue, maybe I made another mistake):
https://community.rstudio.com/t/how-keep-aesthetic-mapping-but-remove-a-specific-item-from-legend-with-ggplot/52818/3
-> trying to replicate the solution with the provided code results in a gray barplot "c" and not a blue bar plot for me.
Remove legend entries for some factors levels
-> again, similar solution, but if I update my scale_color_manual to the following alternative, I again have gray lines for my cumulative values
scale_color_manual(breaks = c("hours_nw_G8", "hours_nw_G9", "hours_ca"),
values = c("#073B4C", "#118AB2", "#FFD166", "#073B4C", "#118AB2", "#FFD166")) +
Code:
require(tidyverse)
## school hours: comparison of US state (CA) and DE State (NW)
df_hours <- data.frame(year=c(1:13),
hours_nw_G8=c(21.5,22.5,25.5,26.5,31.5,31.5,32.5,32.5,33.5,34,34,34,NA),
hours_nw_G9=c(21.5,22.5,25.5,26.5,28,29,30,30,31,31,31.5,29.5,29.5),
hours_ca=c(840,840,840,900,900,900,900,900,1080,1080,1080,1080,NA)
)
df_hours$hours_nw_G8 <- df_hours$hours_nw_G8 * 38 * 0.75 # scaling by #weeks and accounting for German school hour
df_hours$hours_nw_G9 <- df_hours$hours_nw_G9 * 38 * 0.75
# cumulate
df_hours$c_hours_nw_G8 <- cumsum(df_hours$hours_nw_G8) / 8 # cummulate and divide by scaling factor
df_hours$c_hours_nw_G9 <- cumsum(df_hours$hours_nw_G9) / 8
df_hours$c_hours_ca <- cumsum(df_hours$hours_ca) / 8
# reshape & dummy for cumulative
df_hours <- gather(df_hours, state, hours, hours_nw_G8:c_hours_ca, factor_key=TRUE)
df_hours$cu <- c(rep("annual",13*3),rep("cumulative",13*3))
# Figure
ggplot(df_hours, aes(x=year)) +
geom_line(aes(y=hours, color=state, linetype=cu), linewidth = 1) +
scale_y_continuous(limits = c(0,1500), expand = c(0,0),
breaks = c(0,250,500,750,1000,1250,1500),
name="Hours (annual)",
sec.axis = sec_axis(~ .*8, name = "Hours (cummulative)",
breaks = c(0,2000,4000,6000,8000,10000,12000))
) +
scale_x_continuous(limits = c(0.5,13.5), expand = c(0,0), breaks = c(1:13)) +
labs(x="School Year", y="Hours") +
theme_tufte() +
scale_color_manual(values = c("#073B4C", "#118AB2", "#FFD166", "#073B4C", "#118AB2", "#FFD166")) +
theme(axis.line = element_line(linewidth = 0.75), text = element_text(size = 10, color = "black"),
legend.position = c(.3, .9), legend.title = element_blank()) +
guides(linetype = F)
One solution could be using some stringr function to modify the strings:
library(ggthemes)
library(tidyverse)
df_hours %>%
mutate(state_label = str_remove(state, "c_hours_|hours_"),
state_label = str_to_upper(state_label),
state_label = str_replace(state_label, "_", ", ")) %>%
ggplot(aes(x=year)) +
geom_line(aes(y=hours, color=state_label, linetype=cu), linewidth = 1) +
scale_y_continuous(limits = c(0,1500), expand = c(0,0),
breaks = c(0,250,500,750,1000,1250,1500),
name="Hours (annual)",
sec.axis = sec_axis(~ .*8, name = "Hours (cummulative)",
breaks = c(0,2000,4000,6000,8000,10000,12000))
) +
scale_x_continuous(limits = c(0.5,13.5), expand = c(0,0), breaks = c(1:13)) +
labs(x="School Year", y="Hours") +
theme_tufte() +
scale_color_manual(values = c("#073B4C", "#118AB2", "#FFD166", "#073B4C", "#118AB2", "#FFD166")) +
theme(axis.line = element_line(0.75), text = element_text(size = 10, color = "black"),
legend.position = c(.3, .9), legend.title = element_blank()) +
guides(linetype = F)
Related
I'm trying to plot Alluvial Plots using ggplot. So far it went well until I want to try to clean the plot up.
As you can see on the plot, from left to right, the first stratum/column is the ID column then it follows by a column of labels: disease risk. What I want to achieve is in the out plot, instead of having the patient IDs zigzagging, I want them to be ordered by disease risk column, so that all the high risk IDs are all together on top, followed by low risk then the not filled ones. In this way it is much easier to see if there's any relations.
I have looked around for the arrange() and order() functions, they seem to do the trick for my actual input data but once I pass that data frame in ggplot, the output figure is still scrambled.
I thought of set the IDs to factor, then use levels=.... But this is not very smart if the patient ID keeps growing.
Is there a smarter way? please enlighten me. I have attached a link towards the sample data.
https://drive.google.com/file/d/16Pd8V3MCgEHmZEButVi2UjDiwZWklK-T/view?usp=sharing
My code to plot the graph :
library(tidyr)
library(ggplot2)
library(ggalluvial)
library(RColorBrewer)
# Define the number of colors you want
nb.cols <- 10
mycolor1 <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
mycolors <- c("Black")
#read the data
CLL3S.plusrec <- read.csv("xxxx.CSV", as.is = T)
CLL3S.plusrec$risk_by_DS <- factor(CLL3S.plusrec$risk_by_DS, levels = c("low_risk", "high_risk", "Not filled"))
CLL3S.plusrec$`Enriched response phenotype` <- factor(CLL3S.plusrec$`Enriched response phenotype`, levels = c("Live cells","Pre-dead", "TN & PDB", "PDB & Lenalidomide", "TN & STSVEN & Live cells","Mixed"))
#here I reorder the dataframe and it looks good
#but the output ggplot changes the order of ID in the output graph
OR <- with(CLL3S.plusrec, CLL3S.plusrec[order(risk_by_DS),])
d <-ggplot(OR, aes(y = count,
axis1= Patient.ID,
axis2= risk_by_DS,
axis3 = `Cluster assigned consensus`,
axis4 = `Cluster assigned single drug`,
axis5 = `Enriched response phenotype`
)) +
scale_x_discrete(limits = c("Patient ID","Disease Risk", "Consensus cluster", "Single-drug cluster", "Enriched drug response by Phenoptype")) +
geom_alluvium(aes(fill=`Cluster assigned consensus`)) +
geom_stratum(width = 1/3, fill = c(mycolor1[1:69],mycolor1[1:3],mycolor1[1:8],mycolor1[1:8],mycolor1[1:6]), color = "red") +
#geom_stratum() +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=3) +
theme(axis.title.x = element_text(size = 15, face="bold"))+
theme(axis.title.y = element_text(size = 15, face="bold"))+
theme(axis.text.x = element_text(size = 10, face="bold")) +
theme(axis.text.y = element_text(size = 10, face="bold")) +
labs(fill = "Consensus clusters")+
guides(fill=guide_legend(override.aes = list(color=mycolors)))+
ggtitle("Patient flow between the Consensus clusters and Single-drug treated clusters",
"3S stimulated patients")
print(d)
Not sure if this is what you want, try formating the risk column in this way:
library(tidyr)
library(ggplot2)
library(ggalluvial)
library(RColorBrewer)
# Define the number of colors you want
nb.cols <- 10
mycolor1 <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
mycolors <- c("Black")
#read the data
CLL3S.plusrec <- read.csv("test data.CSV", as.is = T)
CLL3S.plusrec$risk_by_DS <- factor(CLL3S.plusrec$risk_by_DS,
levels = c("high_risk","low_risk","Not filled"),ordered = T)
CLL3S.plusrec$Enriched.response.phenotype <- factor(CLL3S.plusrec$Enriched.response.phenotype, levels = c("Live cells","Pre-dead", "TN & PDB", "PDB & Lenalidomide", "TN & STSVEN & Live cells","Mixed"))
#here I reorder the dataframe and it looks good
#but the output ggplot changes the order of ID in the output graph
OR <- with(CLL3S.plusrec, CLL3S.plusrec[order(risk_by_DS),])
ggplot(OR, aes(y = count,
axis1= reorder(Patient.ID,risk_by_DS),
axis2= risk_by_DS,
axis3 = reorder(Cluster.assigned.consensus,risk_by_DS),
axis4 = reorder(Cluster.assigned.single.drug,risk_by_DS),
axis5 = reorder(Enriched.response.phenotype,risk_by_DS)
)) +
scale_x_discrete(limits = c("Patient ID","Disease Risk", "Consensus cluster", "Single-drug cluster", "Enriched drug response by Phenoptype")) +
geom_alluvium(aes(fill=Cluster.assigned.consensus)) +
geom_stratum(width = 1/3, fill = c(mycolor1[1:69],mycolor1[1:3],mycolor1[1:8],mycolor1[1:8],mycolor1[1:6]), color = "red") +
#geom_stratum() +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=3) +
theme(axis.title.x = element_text(size = 15, face="bold"))+
theme(axis.title.y = element_text(size = 15, face="bold"))+
theme(axis.text.x = element_text(size = 10, face="bold")) +
theme(axis.text.y = element_text(size = 10, face="bold")) +
labs(fill = "Consensus clusters")+
guides(fill=guide_legend(override.aes = list(color=mycolors)))+
ggtitle("Patient flow between the Consensus clusters and Single-drug treated clusters",
"3S stimulated patients")
Output:
Also in my read.csv() the quotes got off and dots are in the variables. That is why your original quoted variables now have dots. Maybe an issue from reading.
Update:
#Update
OR <- with(CLL3S.plusrec, CLL3S.plusrec[order(risk_by_DS),])
OR <- OR[order(OR$risk_by_DS,OR$Patient.ID),]
OR$Patient.ID <- factor(OR$Patient.ID,levels = unique(OR$Patient.ID),ordered = T)
#Plot
ggplot(OR, aes(y = count,
axis1= reorder(Patient.ID,risk_by_DS),
axis2= risk_by_DS,
axis3 = reorder(Cluster.assigned.consensus,risk_by_DS),
axis4 = reorder(Cluster.assigned.single.drug,risk_by_DS),
axis5 = reorder(Enriched.response.phenotype,risk_by_DS)
)) +
scale_x_discrete(limits = c("Patient ID","Disease Risk", "Consensus cluster", "Single-drug cluster", "Enriched drug response by Phenoptype")) +
geom_alluvium(aes(fill=Cluster.assigned.consensus)) +
geom_stratum(width = 1/3, fill = c(mycolor1[1:69],mycolor1[1:3],mycolor1[1:8],mycolor1[1:8],mycolor1[1:6]), color = "red") +
#geom_stratum() +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size=3) +
theme(axis.title.x = element_text(size = 15, face="bold"))+
theme(axis.title.y = element_text(size = 15, face="bold"))+
theme(axis.text.x = element_text(size = 10, face="bold")) +
theme(axis.text.y = element_text(size = 10, face="bold")) +
labs(fill = "Consensus clusters")+
guides(fill=guide_legend(override.aes = list(color=mycolors)))+
ggtitle("Patient flow between the Consensus clusters and Single-drug treated clusters",
"3S stimulated patients")
Output:
I would like to know how to use ggplot2 to create a side by side plots with one common legend. I have seen some similar questions but not sure how to directly apply it to my code. I have provided my code for the graphs with the legend and some data that can be used to recreate the graphs.
Stocks1<-c(2,1,0.8,0.7,0.6)
Bonds1<-c(1,0.8,0.7,0.6,0.5)
Cash1<-1-(Stocks1+Bonds1)
Stocks2<-c(0.6,0.5,0.4,0.3,0.2)
Bonds2<-c(0.3,0.2,0.2,0.15,0.1)
Cash2<-1-(Stocks2+Bonds2)
H<-length(Stocks1) #Change value to represent data
t <- seq(from = 0, to = H, 1) # time grid
And here are the two graphs
pi_F<- data.frame(cash = Cash1, bonds = Bonds1,
stocks= Stocks1,time=t[-1])
melted_F <- melt(pi_F, id.vars = 'time')
ggplot(melted_F, aes(x=time, y=value, group = variable)) +
geom_area(aes(fill=variable)) +
scale_fill_manual(values=c("#2E318F", "#DFAE41","#109FC6"),
name="Asset Type",
labels = c("Bank account","Bonds", "Stocks"))+
scale_x_continuous(name = 'Age',
breaks = seq(1,H,1)) +
scale_y_continuous(name = 'Asset allocation (in %)',
labels=scales::percent,
breaks = seq(0,1,0.1),
sec.axis = sec_axis(~.*1,breaks = seq(0,1,0.1),labels=scales::percent)) +
coord_cartesian(xlim = c(1,H), ylim = c(0,1), expand = TRUE) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
pi_F<- data.frame(cash = Cash2, bonds = Bonds2,
stocks= Stocks2,time=t[-1])
melted_F <- melt(pi_F, id.vars = 'time')
ggplot(melted_F, aes(x=time, y=value, group = variable)) +
geom_area(aes(fill=variable)) +
scale_fill_manual(values=c("#2E318F", "#DFAE41","#109FC6"),
name="Asset Type",
labels = c("Bank account","Bonds", "Stocks"))+
scale_x_continuous(name = 'Age',
breaks = seq(1,H,1)) +
scale_y_continuous(name = 'Asset allocation (in %)',
labels=scales::percent,
breaks = seq(0,1,0.1),
sec.axis = sec_axis(~.*1,breaks = seq(0,1,0.1),labels=scales::percent)) +
coord_cartesian(xlim = c(1,H), ylim = c(0,1), expand = TRUE) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
Idealy I would like these side by side with the legend in an appropriate place, probably to the right of both graphs. Thanks in advance for the help!
Put your data together and use facets:
## calling the first data `melted_F` and the second `melted_F2`
## put them in one data frame with a column named "data" to tell
## which is which
melted = dplyr::bind_rows(list(data1 = melted_F, data2 = melted_F2), .id = "data")
## exact same plot code until the last line
ggplot(melted, aes(x=time, y=value, group = variable)) +
geom_area(aes(fill=variable)) +
scale_fill_manual(values=c("#2E318F", "#DFAE41","#109FC6"),
name="Asset Type",
labels = c("Bank account","Bonds", "Stocks"))+
scale_x_continuous(name = 'Age',
breaks = seq(1,H,1)) +
scale_y_continuous(name = 'Asset allocation (in %)',
labels=scales::percent,
breaks = seq(0,1,0.1),
sec.axis = sec_axis(~.*1,breaks = seq(0,1,0.1),labels=scales::percent)) +
coord_cartesian(xlim = c(1,H), ylim = c(0,1), expand = TRUE) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) +
## facet by the column that identifies the data source
facet_wrap(~ data)
I'm trying to plot a two "y" axis, the first one with "soil water content" in % (geom_line() ) and a geom_bar() with precipitation data. The problem is the precipitation chart. I need to "reverse" the plot.
I have this chart now:
and I need the soil water content time series as image above but precipitation as following image
Transforming the data do not solve the problem since i can not transform the bar plot for visualize it form upside down since the data are bars not points. Overmore when i reverse the plot both axis turn reverse
the "hum_melt10" data frame is a data frame with 3 columns: fecha = date (daily), value = water content (%) and variable = if the data is from a probe o from a model
the "pp_melt" data frame is a data frame with 3 columns: fecha = date (daily), value = cm of precipitation for each day and variable = if the water is from precipitation or irrigation
gpp = ggplot() +
geom_line(data = hum_melt10,aes(x = fecha, y = value, color = variable), size = 1.0) +
xlab("Fecha") +
geom_bar(data = pp_melt, aes(x = fecha, y = value / 20, fill = variable), stat="identity",position = 'dodge', na.rm = TRUE) +
scale_y_continuous(name = "Contenido de agua (%)",sec.axis = sec_axis(~.*20, name = "pp y riego (cm)")) +
scale_x_date(breaks = '2 month', labels = fecha, date_labels = '%b %y') +
theme(plot.title = element_text(lineheight=.8, face="bold", size = 20)) +
theme_bw() + theme( panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), aspect.ratio = 0.3)
Thank you!
Agreeing with #dmp, the use of sec_axis only adds the labels to the right; if you want to flip how it looks in the plot, you need to either
Use scale_y_reverse(), which will flip everything;
Manually flip the series in the aesthetic; or
Manually flip the series in the data itself.
Since you only want to reverse one of the data series and not all of them, #1 is out. #3 does work, though you'll still need to modify the call to sec_axis, so I'll keep it simple with doing just #2.
library(ggplot2)
mt <- transform(mtcars, rn = 1:nrow(mtcars))
ggplot(mt) +
geom_bar(aes(x = rn, y = drat), stat = "identity") +
geom_line(aes(x = rn, y = disp/100), stat = "identity", color = "red", size = 1) +
scale_y_continuous(sec.axis = sec_axis(~ . * 100))
Flipping just the red line, we change both how it is defined in the aesthetic (though it could be changed in the frame itself) and sec_axis.
ggplot(mt) +
geom_bar(aes(x = rn, y = drat), stat = "identity") +
geom_line(aes(x = rn, y = 5 - disp/100), stat = "identity", color = "red", size = 1) +
# changes: ^^^ vvv
scale_y_continuous(sec.axis = sec_axis(~ (5 - .) * 100))
(It's important to remember that the flipping point (5 in the example above) is based on the main-axis scale, not the new data.)
Without testing, I suspect the fix for your code would be something like this (where 9 is inferred from the plot).
gpp = ggplot() +
geom_line(data = hum_melt10,aes(x = fecha, y = value, color = variable), size = 1.0) +
xlab("Fecha") +
geom_bar(data = pp_melt, aes(x = fecha, y = 9 - value / 20, fill = variable), stat="identity",position = 'dodge', na.rm = TRUE) +
# changes: ^^^ vvv
scale_y_continuous(name = "Contenido de agua (%)", sec.axis = sec_axis(~(9 - .)*20, name = "pp y riego (cm)")) +
scale_x_date(breaks = '2 month', labels = fecha, date_labels = '%b %y') +
theme(plot.title = element_text(lineheight=.8, face="bold", size = 20)) +
theme_bw() + theme( panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), aspect.ratio = 0.3)
I am trying to chart 2 time series (indexed stock prices) on ggplot, AAPL and MSFT. I want to shade the the area between these two lines but only when the AAPL indexed price is higher than that of MSFT. How do I accomplish this?
I have been reading about using geom_ribbon() but saw that some people said it is problematic and doesn't work when the two lines do not cross. I also have not been able to get the code to work. How do I set my ymin and ymax values for geom_ribbon()? I tried geom_area() as well but then all I created was a stacked area graph.
Here is my code so far:
install.packages("tidyquant")
install.packages("ggplot2")
library(tidyquant)
library(ggplot2)
symbols <- c("AAPL", "MSFT")
data <- tq_get(symbols, get = "stock.prices", from = "2016-01-01")
S1_index <-data$adjusted[which(data$symbol == "AAPL" & data$date == min(data$date))]
S2_index <-data$adjusted[which(data$symbol == "MSFT" & data$date == min(data$date))]
data$adjusted <- ifelse(data$symbol == "AAPL", data$adjusted/S1_index,
ifelse(data$symbol == "MSFT", data$adjusted/S2_index,NA))
ggplot(data,aes(x=date, y=adjusted,colour= symbol)) +
geom_line() +
scale_colour_manual(values = c(AAPL = "darkblue", MSFT = "red")) +
ggtitle("Title Here") + xlab("X Axis Label Here") + ylab("Y Axis Label Here") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_date(date_labels = "%b %y", date_breaks = "6 months") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(color = "Company") +
theme(legend.title = element_blank())
I would like the area between two time series to be shaded when AAPL is higher than MSFT, but currently my code doesn't accomplish that. I'm not very proficient in using ggplot, so I would appreciate any advice you might have.
You can use a ribbon to show the area between the two lines, but it'll require a bit of tweaking to only show the area when AAPL is higher than MSFT. Assume data is the link to the .csv file you've posted and the dates were formatted. First, we're going to build a seperate data.frame in a typical ribbon-like format:
ribbondata <- data.frame(
# We'll keep the x-values for one of the lines
x = data$date[data$symbol == "AAPL"],
# Next we are going to take the pairwise minima and maxima along the lines
ymin = pmin(data$adjusted[data$symbol == "AAPL"], data$adjusted[data$symbol == "MSFT"]),
ymax = pmax(data$adjusted[data$symbol == "AAPL"], data$adjusted[data$symbol == "MSFT"]),
# Then, we'll save a variable for which observations to keep
keep = data$adjusted[data$symbol == "AAPL"] > data$adjusted[data$symbol == "MSFT"]
)
Then here is how I would filter out regions we do not want to shade, and also attach some id variable to the data that keeps track of stretches of data that we do indeed want to shade. We'll use run length encoding for this:
keep_rle <- rle(ribbondata$keep)
# Now we'll replace every TRUE with a counter integer
keep_rle$values[keep_rle$values] <- seq_len(sum(keep_rle$values))
Next, we'll attach the inverse of this run length encoded id to the ribbondata dataframe and remove the bits where ribbondata$KEEP == FALSE.
ribbondata$id <- inverse.rle(keep_rle)
ribbondata <- ribbondata[ribbondata$keep,]
Then, we'll use the plotting code you provided:
g <- ggplot(data,aes(x=date, y=adjusted,colour= symbol)) +
geom_line() +
scale_colour_manual(values = c(AAPL = "darkblue", MSFT = "red")) +
ggtitle("Title Here") + xlab("X Axis Label Here") + ylab("Y Axis Label Here") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_date(date_labels = "%b %y", date_breaks = "6 months") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"))+
labs(color = "Company") +
theme(legend.title = element_blank())
And attach our ribbondata to it:
g <- g + geom_ribbon(data = ribbondata,
aes(x = x, ymin = ymin, ymax = ymax, group = id),
inherit.aes = FALSE)
Now the trick here is to attach our calculated id variable to the group in the aes() call, so that ggplot doesn't interpret the ribbon as a continuous object and draw weird lines at x-values where ribbondata y-values are undefined. Also I've set inherit.aes = FALSE because the ribbondata has different names for x and ymin/ymax variables than the main data.
I ended up with the following plot:
Of course, you can give the ribbon any fill colour or alpha that you want. Good luck!
First reshape your data.
data <- data %>%
# Select down to the necessary columns
select(date, symbol, adjusted) %>%
# Pivot to create columns for both symbols
pivot_wider(names_from = symbol, values_from = adjusted) %>%
# Create new variables for ribbon
mutate(max1 = ifelse(AAPL >= MSFT, AAPL, MSFT)) %>%
mutate(max2 = ifelse(MSFT >= AAPL, MSFT, AAPL))
Next, create your ggplot object
g1 <- data %>%
# Set PlotAesthetics
ggplot(aes(x=date, y=AAPL)) +
# First ribbon creates the color above MSFT and below AAPL
geom_ribbon(aes(ymin=MSFT, ymax=AAPL), fill="grey", alpha=0.9) +
# Second ribbon removes anything below MSFT
geom_ribbon(aes(ymin=0, ymax=MSFT), fill="white", alpha=0.9) +
# Add lines for AAPL and MSFT
geom_line(aes(y=AAPL), color = "blue") +
geom_line(aes(y=MSFT), color = "red") +
# Create Labels
labs(x = "X Axis Label Here", y = "Y Axis Label Here",
title = "Title Here") +
# Set Theme to match your original plot
theme_classic() +
# Need to create custom legend
annotate(geom = "text", x = ymd('2020-06-01'), y = .25, label = "AAPL", hjust = "left") +
annotate(geom = "segment", x = ymd('2020-03-01'), xend = ymd('2020-05-01'), y = .25, yend = .25, colour = "blue", size = 1) +
annotate(geom = "text", x = ymd('2020-06-01'), y = .05, label = "MSFT", hjust = "left") +
annotate(geom = "segment", x = ymd('2020-03-01'), xend = ymd('2020-05-01'), y = .05, yend = .05, colour = "red", size = 1)
I realise this is a bit late, but is an alternative approach to achieving what #bgm was after.
Here is the associated plot
I'm using ggplot() to build two scatter plots that plot health assessment score for (1) male and (2) female patients vs. # weeks in treatment, plus I'm using geom_line() to plot regression line overlays for both the males and females on each graph.
My question: How do I match the colors of the line overlays with the colors of the scatter plot points ('steelblue2' and 'pink3') while still retaining the legend?
I've found if I move color outside of the aes() argument in geom_line(), the colors of the lines and scatterplot points match, but then the legend disappears.
My code & a sample from my data:
mean_behav_by_numweeks = data.frame(
numweeks_round = c(1:10),
Mean_Behavior_Score_Male = c(3.32,4.18,3.82,4.06,3.33, 3.80,3.64,3.66,3.37,3.82),
nrow_male = c(396,323,293,259,226,217,202,190,170,167),
lm_results_predict_male = c(3.82,3.80,3.78,3.76,3.74, 3.72,3.70,3.68,3.66,3.64),
Mean_Behavior_Score_Female = c(2.91,3.79,3.65,3.41, 2.88,2.88,3.78,2.98,3.67,3.93),
nrow_female = c(109,82,72,74,66,60,58,56,52,50),
lm_results_predict_female=c(3.44,3.44,3.45,3.45, 3.46,3.47,3.47,3.48,3.48,3.49))
gg_plot1 <- ggplot(mean_behav_by_numweeks,
aes(numweeks_round,
Mean_Behavior_Score_Male,
size = mean_behav_by_numweeks$nrow_male)) +
geom_point(colour='steelblue2') +
ggtitle(paste("Scatter plot of mean behavior assessment score by member by # weeks \n since 1st assessment for",
as.character(var),
"among Male Medi-Cal plan members")) +
theme(plot.title = element_text(size=10.9, hjust = 0.5)) +
theme(axis.text = element_text(size=8)) +
scale_size_continuous(range = c(1, 7)) +
xlab("Number of weeks since 1st assessment") +
ylab("Mean behavior assessment score") +
theme(legend.position="bottom") +
labs(size="# members") +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_male, color='steelblue2'),
size=1) +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_female, color='pink3'),
size=1) +
scale_color_discrete(name = "GenderCode", labels = c("Female", "Male")) +
theme(legend.position="bottom") +
guides(color = guide_legend(order=1, direction="vertical"))
gg_plot1
gg_plot2 <- ggplot(mean_behav_by_numweeks,
aes(numweeks_round,
Mean_Behavior_Score_Female,
size = mean_behav_by_numweeks$nrow_female)) +
geom_point(colour='pink3') +
ggtitle(paste("Scatter plot of mean behavior assessment score by member by # weeks \n since 1st assessment for",
as.character(var),
"among Female Medi-Cal plan members")) +
theme(plot.title = element_text(size=10.9, hjust = 0.5)) +
theme(axis.text = element_text(size=8)) +
scale_size_continuous(range = c(1, 7)) +
xlab("Number of weeks since 1st assessment") +
ylab("Mean behavior assessment score") +
theme(legend.position="bottom") +
labs(size="# members") +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_male, color='steelblue2'),
size=1) +
geom_line(data=mean_behav_by_numweeks,
aes(numweeks_round, lm_results_predict_female, color='pink3'), size=1) +
scale_color_discrete(name = "GenderCode", labels = c("Female", "Male")) +
theme(legend.position="bottom") +
guides(color = guide_legend(order=1, direction="vertical"))
windows()
gg_plot2
You will want to reshape your data into long format, although you don't have to use melt or gather if you don't want to -- you can stack your data manually, like
library(dplyr)
library(ggplot2)
new_df <- bind_rows(
male = select(mean_behav_by_numweeks,
numweeks_round,
Mean_Behavior_Score = Mean_Behavior_Score_Male,
nrow = nrow_male,
lm_results_predict = lm_results_predict_male),
female = select(mean_behav_by_numweeks,
numweeks_round,
Mean_Behavior_Score = Mean_Behavior_Score_Female,
nrow = nrow_female,
lm_results_predict = lm_results_predict_female),
.id = "gender"
)
Then you can just do
ggplot(new_df, aes(numweeks_round, Mean_Behavior_Score, size = nrow, colour = gender)) +
geom_point() +
theme(plot.title = element_text(size=10.9, hjust = 0.5),
axis.text = element_text(size=8),
legend.position="bottom") +
scale_size_continuous(range = c(1, 7)) +
labs(x = "Number of weeks since 1st assessment",
y = "Mean behavior assessment score",
size="# members") +
geom_line(aes(y = lm_results_predict), size = 1) +
scale_color_manual(name = "GenderCode", labels = c("Female", "Male"), values = c("pink3", "steelblue2")) +
guides(color = guide_legend(order=1, direction="vertical")) +
facet_wrap("gender")
which gives you
One can use gather/separate to first convert data in long format and then plot.
# A simple capitalization function to convert first letter in Caps
# This function is used to convert male/female to Male/Female
.simpleCap <- function(x) {
s <- strsplit(x, " ")[[1]]
paste(toupper(substring(s, 1, 1)), substring(s, 2),
sep = "", collapse = " ")
}
library(tidyverse)
df <- mean_behav_by_numweeks %>%
gather(key, value, - numweeks_round) %>%
separate(key, c("key", "GenderCode"), sep = "_(?=[^_]*?$)") %>% #separates on last _
mutate(GenderCode = mapply(.simpleCap,GenderCode)) %>%
spread(key, value)
Plot the graph:
ggplot(df, aes(numweeks_round, Mean_Behavior_Score, size = nrow, color = GenderCode )) +
geom_point() +
geom_line(aes(y = lm_results_predict, color = GenderCode), size = 1) +
theme(plot.title = element_text(size=10.9, hjust = 0.5),
axis.text = element_text(size=8),
legend.position="bottom") +
labs(x = "Number of weeks since 1st assessment",
y = "Mean behavior assessment score",
size="# members") +
guides(color = guide_legend(order=1, direction="vertical"))
Data:
mean_behav_by_numweeks = data.frame(
numweeks_round = c(1:10),
Mean_Behavior_Score_Male = c(3.32,4.18,3.82,4.06,3.33, 3.80,3.64,3.66,3.37,3.82),
nrow_male = c(396,323,293,259,226,217,202,190,170,167),
lm_results_predict_male = c(3.82,3.80,3.78,3.76,3.74, 3.72,3.70,3.68,3.66,3.64),
Mean_Behavior_Score_Female = c(2.91,3.79,3.65,3.41, 2.88,2.88,3.78,2.98,3.67,3.93),
nrow_female = c(109,82,72,74,66,60,58,56,52,50),
lm_results_predict_female=c(3.44,3.44,3.45,3.45, 3.46,3.47,3.47,3.48,3.48,3.49))