R ggplot2: Adding a Legend to a Time Series with Forecasts - r

I've been looking at this for hours stumped. I've come across a number of suggestions that I need to add aes() and assigning colours to the geom_lines but this isn't generating anything - potentially as I have some forecasts in as well? I'm really not too sure.
In any case I've put my code below, and really appreciate any help that can be provided.
install.packages("fpp")
library(fpp)
data("books")
library(ggplot2)
paperback <- books[,1]
fit1 <- ses(paperback, alpha = 0.2, initial = "simple", h = 3)
fit2 <- ses(paperback, alpha = 0.6, initial = "simple", h = 3)
fit3 <- ses(paperback, h = 3)
autoplot(paperback,
xlab="Day", main="", size = 20) +
geom_line(data = paperback, colour = "black", aes(colour="black")) +
geom_line(data = fitted(fit1), colour = "blue", linetype = 2, aes(colour="blue")) +
geom_line(data = fitted(fit2), colour = "red", linetype = 2, aes(colour="red")) +
geom_line(data = fitted(fit3), colour = "green", linetype = 2, aes(colour = "green")) +
geom_line(data = fit1$mean, colour = "blue", linetype = 2) +
geom_line(data = fit2$mean, colour = "red", linetype = 2) +
geom_line(data = fit3$mean, colour = "green", linetype = 2)

I recommend to plot directly the forecast objects:
install.packages("fpp")
require("fpp"); require("books"); require("ggplot2"); require("ggfortify")
data("books")
paperback <- books[,1]
fit1 <- ses(paperback, alpha = 0.2, initial = "simple", h = 3)
fit2 <- ses(paperback, alpha = 0.6, initial = "simple", h = 3)
fit3 <- ses(paperback, h = 3)
par(mfrow = c(1,3))
plot(fit1)
plot(fit2)
plot(fit3)
and this produces:
and if you want to do it with ggplot you can do:
X <- cbind(model1 = fit1$mean, model2 = fit2$mean, model3 = fit3$mean)
df <- cbind(paperback, X)
colnames(df) <- c("paperback", "model1", "model2", "model3")
autoplot(df)
produces:

Related

How to manually change line size and alpha values for ggplot2 lines (separated by factor)?

I want to create a graph where I can change the line size for each line c(1,2,3) and the alpha values for each line c(0.5,0.6,0.7). I tried to use scale_size_manual but it didn't make any difference. Any ideas on how to proceed?
var <- c("T","T","T","M","M","M","A","A","A")
val <- rnorm(12,4,5)
x <- c(1:12)
df <- data.frame(var,val,x)
ggplot(aes(x= x , y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey","blue","black")) + geom_smooth(aes(x = x, y = val), formula = "y ~ x", method = "loess",se = FALSE, size = 1) + scale_x_continuous(breaks=seq(1, 12, 1), limits=c(1, 12)) + scale_size_manual(values = c(1,2,3))
To set the size and alpha values for your lines you have to map on aesthetics. Otherwise scale_size_manual will have no effect:
library(ggplot2)
ggplot(aes(x = x, y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey", "blue", "black")) +
geom_smooth(aes(x = x, y = val, size = var, alpha = var), formula = "y ~ x", method = "loess", se = FALSE) +
scale_x_continuous(breaks = seq(1, 12, 1), limits = c(1, 12)) +
scale_size_manual(values = c(1, 2, 3)) +
scale_alpha_manual(values = c(.5, .6, .7))

how can i plot a line on bar chart in R

H <- c(1,2,4,1,0,0,3,1,3)
M <- c("one","two","three","four","five")
barplot(H,names.arg=M,xlab="number",ylab="random",col="blue",
main="bar chart",border="blue")
I want to add line on the bar chart , i don't know how to do it
like the one in blue
Maybe you want this?
hist(H, breaks=-1:4, freq=FALSE, xaxt="n")
axis(side=1, at=seq(-0.5, 3.5), labels=M)
lines(density(H))
The graph in the question can be made with code following the lines of:
1. Table the x vector.
tbl <- table(H)
df1 <- as.data.frame(tbl)
2. With package ggplot2, built-in ways of fitting a line can be used.
ggplot(df1, aes(as.integer(Var1), Freq)) +
geom_bar(stat = "identity", fill = "red", alpha = 0.5) +
geom_smooth(method = stats::loess,
formula = y ~ x,
color = "blue", fill = "blue",
alpha = 0.5)
Test data creation code.
set.seed(2020)
f <- function(x) sin(x)^2*exp(x)
p <- f(seq(0, 2.5, by = 0.05))
p <- p/sum(p)
H <- sample(51, size = 1e3, prob = p, replace = TRUE)
Edit
Here is a new graph, with the new data posted in comment. The data is at the end of this answer.
library(ggplot2)
library(scales)
Mdate <- as.Date(paste0(M, "/2020"), format = "%d/%m/%Y")
df1 <- data.frame(H, M = Mdate)
ggplot(df1, aes(M, H)) +
geom_bar(stat = "identity", fill = "red", alpha = 0.5) +
geom_smooth(method = stats::loess,
formula = y ~ x,
color = "blue", fill = "blue", alpha = 0.25,
level = 0.5, span = 0.1) +
scale_x_date(labels = date_format("%d/%m"))
New data
H <- c(1,2,4,1,0,0,3,1,3,3, 6,238,0,
58,17,64,38,3,10,8, 10,11,13,
7,25,11,12,13,28,44)
M <- c("29/02","01/03","02/03","03/03",
"04/03","05/03","06/03","07/03",
"08/03", "09/03","10/03","11/03",
"12/03","13/03","14/03","15/03",
"16/03", "17/03","18/03","19/03",
"20/03","21/03","22/03","23/03",
"24/03", "25/03","26/03","27/03",
"28/03","29/03")

Warning message 'mapping' is not used by stat_function() in R

While completing a project for understanding central limit theorem for exponential distribution, I ran into an annoying error message when plotting simulated vs theoretical distributions. When I run the code below, I get an error: 'mapping' is not used by stat_function().
By mapping I assume the error is referring to the aes parameter, which I later map to color red using scale_color_manual in order to show it in a legend.
My question is two-fold: why is this error happening? and is there a more efficient way to create a legend without using scale_color_manual?
Thank you!
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
It's not an error, it's a warning:
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
stat_function(fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
#> Warning: `mapping` is not used by stat_function()
Created on 2020-05-01 by the reprex package (v0.3.0)
You can suppress the warning by calling geom_line(stat = "function") rather than stat_function():
library(ggplot2)
lambda <- 0.2
n_sims <- 1000
set.seed(100100)
total_exp <- rexp(40 * n_sims, rate = lambda)
exp_data <- data.frame(
Mean = apply(matrix(total_exp, n_sims), 1, mean),
Vars = apply(matrix(total_exp, n_sims), 1, var)
)
g <- ggplot(data = exp_data, aes(x = Mean))
g +
geom_histogram(binwidth = .3, color = 'black', aes(y=..density..), fill = 'steelblue') +
geom_density(size=.5, aes(color = 'Simulation'))+
geom_line(stat = "function", fun = dnorm, mapping = aes(color='Theoretical'), args = list(mean = 1/lambda, sd = 1/lambda/sqrt(40)), size=.5, inherit.aes = F, show.legend = T)+
geom_text(x = 5.6, y = 0.1, label = "Theoretical and Sample Mean", size = 2, color = 'red') +
scale_color_manual("Legend", values = c('Theoretical' = 'red', 'Simulation' = 'blue')) +
geom_vline(aes(xintercept = 1/lambda), lwd = 1.5, color = 'grey') +
labs(x = 'Exponential Distribution Simulations Average Values') +
ggtitle('Sample Mean vs Theoretical Mean of the Averages of the Exponential Distribution')+
theme_classic(base_size = 10)
Created on 2020-05-01 by the reprex package (v0.3.0)
In my opinion, the warning is erroneous, and an issue has been filed about this problem: https://github.com/tidyverse/ggplot2/issues/3611
However, it's not that easy to solve, and therefore as of now the warning is there.
I'm unable to recreate your issue -- when I run your code a plot is generated (below), which suggests the issue is likely to do you with your environment. A general 'solution' is to clear your workspace using the menu dropdown or similar: Session -> Clear workspace..., then re-run your code.
For refactoring the color issue, you can simplify scale_color_manual to
scale_color_manual("Legend", values = c('blue','red')), but how it is now, is a bit better in my view. Anything beyond that has more to do with changing the data structure and mapping.
Apologies, I don't have the rep to make a comment.

Producing a "fuzzy" RD plot with ggplot2

My question is similar to this but the answers there will not work for me. Basically, I'm trying to produce a regression discontinuity plot with a "fuzzy" design that uses all the data for the treatment and control groups, but only plots the regression line within the "range" of the treatment and control groups.
Below, I've simulated some data and produced the fuzzy RD plot with base graphics. I'm hoping to replicate this plot with ggplot2. Note that the most important part of this is that the light blue regression line is fit using all the blue points, while the peach colored regression line is fit using all the red points, despite only being plotted over the ranges in which individuals were intended to receive treatment. That's the part I'm having a hard time replicating in ggplot.
I'd like to move to ggplot because I'd like to use faceting to produce this same plot across various units in which participants were nested. In the code below, I show a non-example using geom_smooth. When there's no fuzziness within a group, it works fine, but otherwise it fails. If I could get geom_smooth to be limited to only specific ranges, I think I'd be set. Any and all help is appreciated.
Simulate data
library(MASS)
mu <- c(0, 0)
sigma <- matrix(c(1, 0.7, 0.7, 1), ncol = 2)
set.seed(100)
d <- as.data.frame(mvrnorm(1e3, mu, sigma))
# Create treatment variable
d$treat <- ifelse(d$V1 <= 0, 1, 0)
# Introduce fuzziness
d$treat[d$treat == 1][sample(100)] <- 0
d$treat[d$treat == 0][sample(100)] <- 1
# Treatment effect
d$V2[d$treat == 1] <- d$V2[d$treat == 1] + 0.5
# Add grouping factor
d$group <- gl(9, 1e3/9)
Produce regression discontinuity plot with base
library(RColorBrewer)
pal <- brewer.pal(5, "RdBu")
color <- d$treat
color[color == 0] <- pal[1]
color[color == 1] <- pal[5]
plot(V2 ~ V1,
data = d,
col = color,
bty = "n")
abline(v = 0, col = "gray", lwd = 3, lty = 2)
# Fit model
m <- lm(V2 ~ V1 + treat, data = d)
# predicted achievement for treatment group
pred_treat <- predict(m,
newdata = data.frame(V1 = seq(-3, 0, 0.1),
treat = 1))
# predicted achievement for control group
pred_no_treat <- predict(m,
newdata = data.frame(V1 = seq(0, 4, 0.1),
treat = 0))
# Add predicted achievement lines
lines(seq(-3, 0, 0.1), pred_treat, col = pal[4], lwd = 3)
lines(seq(0, 4, 0.1), pred_no_treat, col = pal[2], lwd = 3)
# Add legend
legend("bottomright",
legend = c("Treatment", "Control"),
lty = 1,
lwd = 2,
col = c(pal[4], pal[2]),
box.lwd = 0)
non-example with ggplot
d$treat <- factor(d$treat, labels = c("Control", "Treatment"))
library(ggplot2)
ggplot(d, aes(V1, V2, group = treat)) +
geom_point(aes(color = treat)) +
geom_smooth(method = "lm", aes(color = treat)) +
facet_wrap(~group)
Notice the regression lines extending past the treatment range for groups 1 and 2.
There's probably a more graceful way to make the lines with geom_smooth, but it can be hacked together with geom_segment. Munge the data.frames outside of the plotting call if you like.
ggplot(d, aes(x = V1, y = V2, color = factor(treat, labels = c('Control', 'Treatment')))) +
geom_point(shape = 21) +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
geom_segment(data = data.frame(t(predict(m, data.frame(V1 = c(-3, 0), treat = 1)))),
aes(x = -3, xend = 0, y = X1, yend = X2), color = pal[4], size = 1) +
geom_segment(data = data.frame(t(predict(m, data.frame(V1 = c(0, 4), treat = 0)))),
aes(x = 0, xend = 4, y = X1, yend = X2), color = pal[2], size = 1)
Another option is geom_path:
df <- data.frame(V1 = c(-3, 0, 0, 4), treat = c(1, 1, 0, 0))
df <- cbind(df, V2 = predict(m, df))
ggplot(d, aes(x = V1, y = V2, color = factor(treat, labels = c('Control', 'Treatment')))) +
geom_point(shape = 21) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
geom_path(data = df, size = 1)
For the edit with facets, if I understand what you want correctly, you can calculate a model for each group with lapply and predict for each group. Here I'm recombine with dplyr::bind_rows instead of do.call(rbind, ...) for the .id parameter to insert the group number from the list element name, though there are other ways to do the same thing.
df <- data.frame(V1 = c(-3, 0, 0, 4), treat = c('Treatment', 'Treatment', 'Control', 'Control'))
m_list <- lapply(split(d, d$group), function(x){lm(V2 ~ V1 + treat, data = x)})
df <- dplyr::bind_rows(lapply(m_list, function(x){cbind(df, V2 = predict(x, df))}), .id = 'group')
ggplot(d, aes(x = V1, y = V2, color = treat)) +
geom_point(shape = 21) +
geom_vline(aes(xintercept = 0), color = 'grey', size = 1, linetype = 'dashed') +
geom_path(data = df, size = 1) +
scale_color_brewer(NULL, type = 'qual', palette = 6) +
facet_wrap(~group)

#What causes different behaviour between stats and ggplot2 when writing histograms, normal curves and qqplots to .pdf?

I need to produce plots for statistical analyses and I am stumped by a difference in behaviour between stats and ggplot. Who can help out?
I am trying to produce a pdf with histograms, including normal curves, side-by-side with qqplots, with the next plot continuing on the same page. Preferably using ggplot (because prettier plots). I have a large number of variables in my real dataset, so I am using a 'for' loop.
library(ggplot2)
library(stats)
library(datasets)
This piece of ggplot code does what I want it to do.
ggplot(airquality, aes(Wind)) +
geom_histogram(aes(y = ..density..),colour = "black", fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(airquality$Wind), sd = sd(airquality$Wind)), colour = "red", size = 1) +
xlab("Wind")
qplot(sample = airquality$Wind, stat = "qq")
I am fine with the binwidth warning, I want that picked automatically, and I will build in a suppression for that message later on. I am not sure wat to do though with: '"stat" is deprecated' Anyone?
If I try to work this into a 'for' loop, I cannot get it to work. It keeps putting every plot on a new page and it leaves out the normal curves:
Variablesairquality<-c("Wind", "Temp", "Month", "Day")
pdf(file = "Normality.pdf", 4, 5)
par(mfrow = c(2,2))
for(i in Variablesairquality){
plot(ggplot(airquality, aes(airquality[,i])) +
geom_histogram(aes(y = ..density..),colour = "black", fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(airquality[,i]), sd = sd(airquality[,i])), colour = "red", size = 1) +
xlab(i)
)
plot(qplot(sample = airquality[,i], stat = "qq" )
)
}
dev.off()
Which I don’t get, because if I try it using stats, it does exactly what I want:
pdf(file = "Normality2.pdf", 4, 5)
par(mfrow = c(2,2))
for(i in Variablesairquality){
h <- hist(airquality[,i], col = "white", cex.axis=0.50, xlab = i, cex.lab=0.75, main = paste("Distribution"), cex.main= 0.75)
xfit<-seq(min(airquality[,i]),max(airquality[,i]),length=length(airquality[,i]))
yfit<-dnorm(xfit,mean=mean(airquality[,i]),sd=sd(airquality[,i]))
yfit <- yfit*diff(h$mids[1:2])*length(airquality[,i])
lines(xfit, yfit, col="red", lwd=1)
qqnorm(airquality[,i], cex = 0.5, cex.axis=0.50, cex.lab=0.75, main = expression("Q-Q plot for"~paste(i)), cex.main= 0.75)
qqline(airquality[,i], col = "red")
}
dev.off()
(Accept for the thing with the main label which I still need to figure out. Anyone any tips?)
I would be most grateful if someone could point out the mistake in my ggplot code or otherwise explain this behaviour. Thanks!
I use R-programming V3.2.3 and R-studio v0.99.891. (And yes, I read every similar item here, scowered the internet and I read the help files; that did not get me where I need to go.)
On `stat` is deprecated, see Deprecated features in the ggplot2 2.0.0 release notes. Use instead:
ggplot(airquality, aes(sample = Wind)) +
stat_qq()
If you don't wish to use gridExtra::grid.arrange, here's an approach that uses facets. Begin by wrangling the data into a new dataframe with the values we want for x, y, plot type, and geom variables:
d <- as.data.frame(qqnorm(airquality$Wind, plot.it = F))
d$plot <- "QQ plot"
d$geom <- "point"
d <- rbind(d, data.frame(x = airquality$Wind, y = NA,
plot = "Histogram", geom = "bar"))
d <- rbind(d, with(airquality, data.frame(
x = seq(min(Wind), max(Wind), l = 100),
y = dnorm(seq(min(Wind), max(Wind), l = 100),
mean = mean(Wind), sd = sd(Wind)),
plot = "Histogram", geom = "line")))
Then call ggplot, subsetting the data as appropriate for each geom:
ggplot(d, aes(x = x, y = y)) + facet_wrap(~plot, scales = "free") +
geom_histogram(data = subset(d, plot == "Histogram" & geom == "bar"),
aes(y = ..density..),
colour = "black", fill = "white") +
geom_line(data = subset(d, plot == "Histogram" & geom == "line"),
colour = "red", size = 1) +
geom_point(data = subset(d, plot == "QQ plot")) +
labs(x = "Wind")
Output:
To do multiple plots, you can wrap the code above into a for loop, making sure to wrap ggplot inside print:
pdf("path/to/pdf/out.pdf")
Variablesairquality <- c("Wind", "Temp", "Month", "Day")
for (i in rev(Variablesairquality)) {
x <- airquality[[i]]
d <- as.data.frame(qqnorm(x, plot.it = F))
d$plot <- "QQ plot"
d$geom <- "point"
d <- rbind(d, data.frame(x = x, y = NA, plot = "Histogram", geom = "bar"))
d <- rbind(d, data.frame(x = seq(min(x), max(x), l = 100),
y = dnorm(seq(min(x), max(x), l = 100),
mean = mean(x), sd = sd(x)),
plot = "Histogram", geom = "line"))
print(
ggplot(d, aes(x = x, y = y)) + facet_wrap(~plot, scales = "free") +
geom_histogram(data = subset(d, plot == "Histogram" & geom == "bar"),
aes(y = ..density..),
colour = "black", fill = "white") +
geom_line(data = subset(d, plot == "Histogram" & geom == "line"),
colour = "red", size = 1) +
geom_point(data = subset(d, plot == "QQ plot")) +
labs(x = i)
)
}
dev.off()

Resources