Plot forecast and actual values - r

Hi all. I need some help from a statistics expert. I have made a simple ARIMA forecast for a few values in as, but I trained it on a subset of the values, train_as.
Is there a way to plot both the actual values and the forecasted values here? For example, the actual values in 2019 are 4, 8, 12, 16 and the forecast is 9, 10, 11, 12. Can we plot this?
library(forecast)
library(ggplot2)
as <- data.frame(a = c(1, 2, 3, 4, 2, 4, 6, 8, 4, 8, 12, 16))
train_as <- as[c(1:8), ]
a1 <- ts(train_as, start = c(2017, 1), end = c(2017, 8), frequency = 4)
fit_arima <- auto.arima(a1, trace = TRUE, ic = "aic")
print(summary(fit_arima))
checkresiduals(fit_arima)
fcst <- forecast(fit_arima, h = 4)
autoplot(fcst, include = 8)

This is easy to do using the forecast package with the autolayer() function.
library(forecast)
library(ggplot2)
as <- data.frame(a = c(1, 2, 3, 4, 2, 4, 6, 8, 4, 8, 12, 16))
# Convert to a time series
y <- ts(as$a, start = 2017, frequency = 4)
# Split in two
a1 <- subset(y, end = 8)
a2 <- subset(y, start = 9)
# Fit model
fit_arima <- auto.arima(a1, ic = "aic")
# Compute forecasts
fcst <- forecast(fit_arima, h = 4)
# Plot forecasts and test set
autoplot(fcst) + autolayer(a2)
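As a small aside of mine (not part of the original answer): if you also want a numerical comparison against the held-out values, the forecast package's accuracy() function accepts the test set as its second argument:
accuracy(fcst, a2)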

You can try something like this. First, create your test dataset:
test_as <- as[c(9:12),]
Now build a data.frame to plot. It holds the real data, the time, and the predicted values (plus their prediction intervals). The predictions must have the same length as the time and real data, so I padded them with an NA vector whose length equals the difference between the real data and the predictions (the same for the intervals). Note the time is generated by quarter with the zoo package:
library(zoo)
df <- data.frame(
  real = as$a,
  pred = c(rep(NA, length(as$a) - length(data.frame(fcst)[, 1])), data.frame(fcst)[, 1]),
  time = zoo::as.yearqtr(seq(as.Date("2017/1/1"), as.Date("2019/12/1"), by = "quarter"), format = "%Y-%m-%d"),
  Lo80 = c(rep(NA, length(as$a) - length(data.frame(fcst)[, 2])), data.frame(fcst)[, 2]),
  Hi80 = c(rep(NA, length(as$a) - length(data.frame(fcst)[, 3])), data.frame(fcst)[, 3]),
  Lo95 = c(rep(NA, length(as$a) - length(data.frame(fcst)[, 4])), data.frame(fcst)[, 4]),
  Hi95 = c(rep(NA, length(as$a) - length(data.frame(fcst)[, 5])), data.frame(fcst)[, 5])
)
Now you can plot it:
library(ggplot2)
ggplot(df, aes(time, pred, group = 1)) +
  geom_line() +
  geom_line(aes(time, real, group = 1), color = "red") +
  geom_ribbon(aes(time, ymin = Lo95, ymax = Hi95), fill = "red", alpha = 0.25) +
  geom_ribbon(aes(time, ymin = Lo80, ymax = Hi80), fill = "red", alpha = 0.25) +
  theme_light()
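A hedged aside of mine: since df$time is a yearqtr, zoo also ships a ggplot2 scale for quarterly axes, which can tidy up the x-axis labels (assuming a reasonably recent zoo version):
ggplot(df, aes(time, pred, group = 1)) +
  geom_line() +
  geom_line(aes(time, real, group = 1), color = "red") +
  zoo::scale_x_yearqtr(format = "%Y Q%q")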

Related

ggplot2 adding label to geom_area

I'm teaching undergrad statistics and trying to make a useful little R script to help my students understand calculating probabilities in the standard normal distribution. I have this script, which takes z-score breakpoints, calculates the fraction of data between each pair of breakpoints, and colors each section:
library(tidyverse)
library(ggplot2)
library(magrittr)
sim_dat <- data.frame(z = seq(-5, 5, length.out = 1001))
sim_dat$y <- dnorm(sim_dat$z, mean = 0, sd = 1)
# fill in z-score breakpoints, excluding zero: 0 will always be included
zscores <- c(-1, 1.5)
zscores <- sort(setdiff(zscores, 0))
bkpoints <- sort(c(-Inf, zscores, 0, Inf))
# find pct of data between breakpoints
pctdata <- numeric(length = length(bkpoints) - 1)
interval <- character(length = length(bkpoints) - 1)
for (i in 1:length(pctdata)) {
  pctdata[i] <- plyr::round_any(pnorm(q = bkpoints[i + 1]) - pnorm(q = bkpoints[i]), 0.0001)
  interval[i] <- paste0(bkpoints[i], ",", bkpoints[i + 1])
}
pctdata_df <- cbind.data.frame(interval, pctdata, stringsAsFactors = FALSE)
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
p1 <- ggplot2::ggplot(sim_dat, aes(z, y, fill = standard_normal_sections)) +
  geom_area() +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))
p1
pctdata_df
I'd like to use pctdata_df$pctdata (a vector of how much data is in each section of p1) as labels. I'm finding very little on how to add labels to geom_area. Any help is appreciated!
There is nothing special about geom_area. If you want to add labels you could do so with geom_text, where you pass your pctdata_df to the data argument. As you gave no information on where you want to add your labels, I have put them beneath the area chart.
Note: There is no need for a for loop. You could simply pass a vector to pnorm or paste.
library(scales)
library(ggplot2)
# find pct of data between breakpoints
lower <- bkpoints[1:(length(bkpoints) - 1)]
upper <- bkpoints[2:length(bkpoints)]
pctdata <- pnorm(q = upper) - pnorm(q = lower)
interval <- paste0(lower, ",", upper)
pctdata_df <- data.frame(interval, lower, upper, pctdata)
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(lower), upper - 1, .5 * (lower + upper)))
pctdata_df$x_label <- with(pctdata_df, ifelse(is.infinite(upper), lower + 1, x_label))
sim_dat$standard_normal_sections <- cut(sim_dat$z, breaks = bkpoints)
ggplot(sim_dat, aes(z, y)) +
  geom_area(aes(fill = standard_normal_sections)) +
  geom_text(data = pctdata_df, aes(x = x_label, y = 0, label = scales::number(pctdata, .01)),
            vjust = 1, size = 8 / .pt, nudge_y = -.01) +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))
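A hedged variation (my addition, not from the original answer): to place the labels inside each region instead, one could set the label height to half the normal curve's height at x_label. The y_label column is a hypothetical helper:
pctdata_df$y_label <- dnorm(pctdata_df$x_label) / 2  # half the curve height at the label position
ggplot(sim_dat, aes(z, y)) +
  geom_area(aes(fill = standard_normal_sections)) +
  geom_text(data = pctdata_df,
            aes(x = x_label, y = y_label, label = scales::number(pctdata, .01)),
            size = 8 / .pt) +
  scale_x_continuous(breaks = c(seq(-5, 5, 1), zscores))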

How to do a non-linear regression using geom_smooth when start values are separated by category in a different data frame?

I have 2 data frames: one with experimental data that need to be fitted to a non-linear model and another with the starting values for fitting by the nls method. Both experimental data and starting values are separated into categories a and b. I want to make a graph using ggplot2 that shows the curve fitted to the points and separated by category, but I can't indicate the starting values, which are in another data frame, for each category.
In the MWE, I present the data frame with the starting values in two ways: 1. each column is a category, or 2. each row is a category (see the Constants1 and Constants2 objects). I thought this organization might be relevant for referencing the values in ggplot.
library(ggplot2)
Category <- c("a", "b")
k1 <- c(10, 20)
k2 <- c(0.01, 0.02)
Constants1 <- data.frame(Category, k1, k2)
Constants2 <- data.frame(rbind(k1, k2))
colnames(Constants2) <- Category
x <- seq(0, 100, 20)
y <- c(0, 2, 3.5, 4.5, 5.5, 6,
       0, 7, 11, 14, 16, 17)
df <- expand.grid(x = x,
                  Category = Category)
df$y <- y
ggplot(data = df,
       aes(x = x,
           y = y)) +
  geom_point(aes(shape = Category)) +
  geom_smooth(aes(linetype = Category),
              formula = y ~ k1 * (1 - exp((-k2) * x)),
              method.args = list(start = list(k1 = ??, # Help here
                                              k2 = ??)),
              se = FALSE,
              method = "nls")
Maybe this is what you are looking for. Instead of making use of just one geom_smooth you could add one for each combination of starting values. To this end I make use of purrr::pmap to loop over the data frame with the starting values to create a list of geom_smooth layers which could then be added to the ggplot:
library(ggplot2)
library(purrr)
layer_smooth <- pmap(Constants1, function(...) {
  args <- list(...)
  geom_smooth(aes(linetype = Category),
              formula = y ~ k1 * (1 - exp((-k2) * x)),
              method.args = list(start = list(k1 = args$k1,
                                              k2 = args$k2)),
              se = FALSE,
              method = "nls")
})
ggplot(data = df,
       aes(x = x,
           y = y)) +
  geom_point(aes(shape = Category)) +
  layer_smooth
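As a hedged aside (my addition): the model k1*(1-exp(-k2*x)) is the asymptotic-regression-through-origin form that base R's self-starting model SSasympOrig implements (with k1 = Asym and k2 = exp(lrc)), so the starting values can be skipped entirely:
ggplot(data = df, aes(x = x, y = y)) +
  geom_point(aes(shape = Category)) +
  geom_smooth(aes(linetype = Category),
              formula = y ~ SSasympOrig(x, Asym, lrc), # self-starting, no start list needed
              se = FALSE,
              method = "nls")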

How do I propagate the error of a linear regression when projecting from Y to X?

I'm trying to figure out how to propagate errors in the following case: I am calibrating a machine with a couple of standards (a, b, c) with accepted values x. My machine measures y for these standards, with a certain error (standard deviation of 1 in this example).
Then I measure replicates of a sample, yielding ynew. Now I want to convert these values to the accepted measurement scale (the x-axis). To do this, I can of course do some linear algebra and convert the slope and intercept that I got from my standard measurements to a reversed slope and intercept, i.e. slope' = 1/slope and intercept' = -intercept/slope, so that x = intercept' + slope' * y.
This works nicely to convert the input values, but how do I get proper estimates of the errors?
In R, I've tried the following:
library(broom)   # for tidy lm
library(ggplot2) # for plotting
library(dplyr)   # to allow piping

# find confidence value
cv <- function(x, level = 95) {
  qt(1 - ((100 - level) / 100) / 2, df = length(x) - 1) * sd(x) / sqrt(length(x))
}
# find confidence interval
ci <- function(x, level = 95) {
  xbar <- mean(x)
  xci <- cv(x, level = level)
  c(fit = xbar, lwr = xbar - xci, upr = xbar + xci)
}

set.seed(1337)
# create fake data
dat <- data.frame(id = rep(letters[1:3], 20),
                  x = rep(c(1, 7, 10), 20)) %>%
  mutate(y = rnorm(n(), -20 + 1.25 * x, 1))
# generate linear model
mod <- lm(y ~ x, dat)
# tidy
mod_aug <- augment(mod)
# these are the new samples that my machine measures
ynew <- rnorm(10, max(dat$y) + 3)

# predict new x-value based on y-value that is outside of range
## predict(mod, newdata = data.frame(y = ynew), interval = "predict")
# Error in eval(predvars, data, env) : object 'x' not found
# or tidy
## augment(mod, newdata = data.frame(y = ynew))
# 50 row df that doesn't make sense

# found this function that should do the job, but it doesn't extrapolate
## approx(x = mod$fitted.values, y = dat$x, xout = ynew)$y
# [1] NA NA NA NA NA NA NA NA NA NA
# this one from Hmisc does allow for extrapolation
with_approx <- Hmisc::approxExtrap(x = mod_aug$.fitted, y = mod_aug$x, xout = ynew)$y
# but in case of lm, isn't using the slope and intercept of a model okay too?
with_itc_slp <- (-coef(mod)[1] / coef(mod)[2]) + (1 / coef(mod)[2]) * ynew

# this would be the 95% prediction interval of the model at the average
# sample position. Could also use "confidence" but this is more correct?
avg_prediction <- predict(mod,
                          newdata = data.frame(x = mean(with_itc_slp)),
                          interval = "prediction")

# plot it
ggplot(dat, aes(x = x, y = y, col = id)) +
  geom_point() +
  geom_hline(yintercept = ynew, col = "gray") +
  geom_smooth(aes(group = 1), method = "lm", se = F, fullrange = T,
              col = "lightblue") +
  geom_smooth(aes(group = 1), method = "lm") +
  # 95% CI of the new sample
  annotate("pointrange", x = 1, y = mean(ynew),
           ymin = ci(ynew)[2], ymax = ci(ynew)[3], col = "green") +
  # 95% prediction interval of the linear model at the average transformed
  # x-position
  annotate("pointrange", x = mean(with_approx), y = mean(ynew),
           ymin = avg_prediction[2], ymax = avg_prediction[3], col = "green") +
  # transformed using approx
  annotate("point", x = with_approx, y = ynew, size = 3, col = "blue",
           shape = 1) +
  # transformed using intercept and slope
  annotate("point", x = with_itc_slp, y = ynew, size = 3, col = "red",
           shape = 2) +
  # it's pretty
  coord_fixed()
resulting in this plot:
Now how do I go from these 95% CIs in the y-direction to transformed sample
x-values with a confidence interval in the x-direction?
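(My addition, hedged.) One base-R sketch for an x-direction interval is the delta method applied to x0 = (y0 - b0)/b1, combining the coefficient covariance with the replicate error of ynew. The 1.96 factor and the assumed independence between the fit and the new measurements are simplifications:
b <- coef(mod)
V <- vcov(mod)
y0 <- mean(ynew)
x0 <- (y0 - b[1]) / b[2]
# gradient of x0 with respect to (b0, b1)
g <- c(-1 / b[2], -(y0 - b[1]) / b[2]^2)
# add the variance of the mean of the replicates, scaled by 1/b1^2
var_x0 <- drop(t(g) %*% V %*% g) + (var(ynew) / length(ynew)) / b[2]^2
x0 + c(lwr = -1, upr = 1) * 1.96 * sqrt(var_x0)
Packages such as investr (its calibrate() function) treat this inverse-prediction problem more rigorously.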

Forecast with ggplot2 and funggcast function

On this website, Mr. Davenport published a function to plot an ARIMA forecast with ggplot2, demonstrated on an arbitrary dataset, which he published here. I can follow his example without any error message.
Now, when I use my data, I end up with the warnings:
1: In window.default(x, ...) : 'end' value not changed
2: In window.default(x, ...) : 'end' value not changed
I know that it happens when I call the command pd <- funggcast(yt, yfor), due to an issue with the end value I indicate in my data, end = c(2013). But I do not know how to fix that.
This is the code I use:
library(ggplot2)
library(zoo)
library(forecast)
myts <- ts(rnorm(55), start = c(1960), end = c(2013), freq = 1)
funggcast <- function(dn, fcast){
  en <- max(time(fcast$mean)) # Extract the max date used in the forecast
  # Extract Source and Training Data
  ds <- as.data.frame(window(dn, end = en))
  names(ds) <- 'observed'
  ds$date <- as.Date(time(window(dn, end = en)))
  # Extract the Fitted Values (need to figure out how to grab confidence intervals)
  dfit <- as.data.frame(fcast$fitted)
  dfit$date <- as.Date(time(fcast$fitted))
  names(dfit)[1] <- 'fitted'
  ds <- merge(ds, dfit, all.x = T) # Merge fitted values with source and training data
  # Extract the Forecast values and confidence intervals
  dfcastn <- as.data.frame(fcast)
  dfcastn$date <- as.Date(as.yearmon(row.names(dfcastn)))
  names(dfcastn) <- c('forecast','lo80','hi80','lo95','hi95','date')
  pd <- merge(ds, dfcastn, all.x = T) # final data.frame for use in ggplot
  return(pd)
}
yt <- window(myts, end = c(2013)) # extract training data until last year
yfit <- auto.arima(myts) # fit arima model
yfor <- forecast(yfit) # forecast
pd <- funggcast(yt, yfor) # extract the data for ggplot using function funggcast()
ggplot(data = pd, aes(x = date, y = observed)) +
  geom_line(color = "red") +
  geom_line(aes(y = fitted), color = "blue") +
  geom_line(aes(y = forecast)) +
  geom_ribbon(aes(ymin = lo95, ymax = hi95), alpha = .25) +
  scale_x_date(name = "Time in Decades") +
  scale_y_continuous(name = "GDP per capita (current US$)") +
  theme(axis.text.x = element_text(size = 10),
        legend.justification = c(0, 1), legend.position = c(0, 1)) +
  ggtitle("Arima(0,1,1) Fit and Forecast of GDP per capita for Brazil (1960-2013)") +
  scale_color_manual(values = c("Blue", "Red"), breaks = c("Fitted", "Data", "Forecast"))
Edit: I found another blog here with a function to use with forecast and ggplot2, but I would like to use the approach above if I could find my mistake. Anyone?
Edit2:
If I run your updated code with my data here, then I get the graph down below. Note that I did not change end = c(2023) for myts; otherwise it would not merge the forecasted with the fitted values.
myts <- ts(WDI_gdp_capita$Brazil, start = c(1960), end = c(2023), freq = 1)
funggcast <- function(dn, fcast){
  en <- max(time(fcast$mean)) # Extract the max date used in the forecast
  # Extract Source and Training Data
  ds <- as.data.frame(window(dn, end = en))
  names(ds) <- 'observed'
  ds$date <- as.Date(time(window(dn, end = en)))
  # Extract the Fitted Values (need to figure out how to grab confidence intervals)
  dfit <- as.data.frame(fcast$fitted)
  dfit$date <- as.Date(time(fcast$fitted))
  names(dfit)[1] <- 'fitted'
  ds <- merge(ds, dfit, all = T) # Merge fitted values with source and training data
  # Extract the Forecast values and confidence intervals
  dfcastn <- as.data.frame(fcast)
  dfcastn$date <- as.Date(paste(row.names(dfcastn), "01", "01", sep = "-"))
  names(dfcastn) <- c('forecast','lo80','hi80','lo95','hi95','date')
  pd <- merge(ds, dfcastn, all.x = T) # final data.frame for use in ggplot
  return(pd)
} # ggplot function by Frank Davenport
yt <- window(myts, end = c(2013)) # extract training data until last year
yfit <- auto.arima(yt) # fit arima model
yfor <- forecast(yfit) # forecast
pd <- funggcast(myts, yfor) # extract the data for ggplot using function funggcast()
ggplot(data = pd, aes(x = date, y = observed)) +
  geom_line(color = "red") +
  geom_line(aes(y = fitted), color = "blue") +
  geom_line(aes(y = forecast)) +
  geom_ribbon(aes(ymin = lo95, ymax = hi95), alpha = .25) +
  scale_x_date(name = "Time in Decades") +
  scale_y_continuous(name = "GDP per capita (current US$)") +
  theme(axis.text.x = element_text(size = 10),
        legend.justification = c(0, 1), legend.position = c(0, 1)) +
  ggtitle("Arima(0,1,1) Fit and Forecast of GDP per capita for Brazil (1960-2013)") +
  scale_color_manual(values = c("Blue", "Red"), breaks = c("Fitted", "Data", "Forecast"))
ggsave(filename = "gdp_forecast_ggplot.pdf", width = 330, height = 180,
       units = "mm", dpi = 300, limitsize = TRUE)
The almost perfect graph I get:
One additional question: How can I get a legend in this graph?
If I set end = c(2013) for myts, I get the same graph as in the beginning:
There are several points that differ between Mr Davenport's analysis and the plot you are trying to make.
The first one is that he is comparing the ARIMA forecast to some observed data, which is why he trains the model on a portion of the whole time series, the training set.
To do this, you should make your initial time series longer:
myts <- ts(rnorm(55), start = c(1960), end = c(2023), freq = 1)
Then at the end of your script, where you select the training up to 2013:
yt <- window(myts, end = c(2013)) # extract training data until last year
The model should be trained on the training set, not the whole time series, so you should change the yfit line to:
yfit <- auto.arima(yt) # fit arima model
And call the funggcast function using the whole time series, because it needs the observed and fitted data:
pd <- funggcast(myts, yfor)
Finally, he uses dates that have month and year, so in his funggcast function, change this line:
dfcastn$date <- as.Date(as.yearmon(row.names(dfcastn)))
To:
dfcastn$date <- as.Date(paste(row.names(dfcastn),"01","01",sep="-"))
This is because the values predicted by the model need to be changed to dates, like 2014 has to be changed to 2014-01-01, in order to be merged with the observed data.
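For example (my illustration of that conversion), a forecast row name of "2014" becomes a proper Date like this:
as.Date(paste("2014", "01", "01", sep = "-"))
# [1] "2014-01-01"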
After all the changes, the code looks like this:
library(ggplot2)
library(zoo)
library(forecast)
myts <- ts(rnorm(55), start = c(1960), end = c(2013), freq = 1)
funggcast <- function(dn, fcast){
  en <- max(time(fcast$mean)) # Extract the max date used in the forecast
  # Extract Source and Training Data
  ds <- as.data.frame(window(dn, end = en))
  names(ds) <- 'observed'
  ds$date <- as.Date(time(window(dn, end = en)))
  # Extract the Fitted Values (need to figure out how to grab confidence intervals)
  dfit <- as.data.frame(fcast$fitted)
  dfit$date <- as.Date(time(fcast$fitted))
  names(dfit)[1] <- 'fitted'
  ds <- merge(ds, dfit, all.x = T) # Merge fitted values with source and training data
  # Extract the Forecast values and confidence intervals
  dfcastn <- as.data.frame(fcast)
  dfcastn$date <- as.Date(paste(row.names(dfcastn), "01", "01", sep = "-"))
  names(dfcastn) <- c('forecast','lo80','hi80','lo95','hi95','date')
  pd <- merge(ds, dfcastn, all = T) # final data.frame for use in ggplot
  return(pd)
}
yt <- window(myts, end = c(2013)) # extract training data until last year
yfit <- auto.arima(yt) # fit arima model
yfor <- forecast(yfit) # forecast
pd <- funggcast(myts, yfor) # extract the data for ggplot using function funggcast()
plotData <- ggplot(data = pd, aes(x = date, y = observed)) +
  geom_line(aes(color = "1")) +
  geom_line(aes(y = fitted, color = "2")) +
  geom_line(aes(y = forecast, color = "3")) +
  scale_colour_manual(values = c("red", "blue", "black"),
                      labels = c("Observed", "Fitted", "Forecasted"), name = "Data") +
  geom_ribbon(aes(ymin = lo95, ymax = hi95), alpha = .25) +
  scale_x_date(name = "Time in Decades") +
  scale_y_continuous(name = "GDP per capita (current US$)") +
  theme(axis.text.x = element_text(size = 10)) +
  ggtitle("Arima(0,1,1) Fit and Forecast of GDP per capita for Brazil (1960-2013)")
plotData
And you get a plot that looks like this; the fit is pretty bad with a completely random time series. ggplot will also print some warnings, because the forecast line has no data before 2013 and the fitted data does not go on after 2013. (I ran it several times; depending on the initial random time series, the model might just predict 0 everywhere.)
Edit: changed the pd assignment line as well, in case there are no observed data after 2013
Edit2: I changed the ggplot function at the end of the code to make sure the legend shows up
There is a package called ggfortify, available via GitHub, which allows straight plotting of forecast objects with ggplot2. It can be found at http://rpubs.com/sinhrks/plot_ts
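A minimal sketch of that approach (my addition; it assumes ggfortify is installed, whose autoplot() method then dispatches on forecast objects):
library(forecast)
library(ggfortify)
autoplot(forecast(auto.arima(AirPassengers), h = 24))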
This is a bump on a rather old post, but there's a function on GitHub that produces some nice results.
Here's the code as it was on Aug 03, 2016 (the original snippet is anonymous; I've assigned it to the name ggplot_forecast so it can actually be called):
# (function name added here for convenience; the original snippet is anonymous)
ggplot_forecast <- function(forec.obj, data.color = 'blue', fit.color = 'red', forec.color = 'black',
                            lower.fill = 'darkgrey', upper.fill = 'grey', format.date = F)
{
  serie.orig = forec.obj$x
  serie.fit = forec.obj$fitted
  pi.strings = paste(forec.obj$level, '%', sep = '')

  if(format.date)
    dates = as.Date(time(serie.orig))
  else
    dates = time(serie.orig)

  serie.df = data.frame(date = dates, serie.orig = serie.orig, serie.fit = serie.fit)

  forec.M = cbind(forec.obj$mean, forec.obj$lower[, 1:2], forec.obj$upper[, 1:2])
  forec.df = as.data.frame(forec.M)
  colnames(forec.df) = c('forec.val', 'l0', 'l1', 'u0', 'u1')

  if(format.date)
    forec.df$date = as.Date(time(forec.obj$mean))
  else
    forec.df$date = time(forec.obj$mean)

  p = ggplot() +
    geom_line(aes(date, serie.orig, colour = 'data'), data = serie.df) +
    geom_line(aes(date, serie.fit, colour = 'fit'), data = serie.df) +
    scale_y_continuous() +
    geom_ribbon(aes(x = date, ymin = l0, ymax = u0, fill = 'lower'), data = forec.df, alpha = I(0.4)) +
    geom_ribbon(aes(x = date, ymin = l1, ymax = u1, fill = 'upper'), data = forec.df, alpha = I(0.3)) +
    geom_line(aes(date, forec.val, colour = 'forecast'), data = forec.df) +
    scale_color_manual('Series', values = c('data' = data.color, 'fit' = fit.color, 'forecast' = forec.color)) +
    scale_fill_manual('P.I.', values = c('lower' = lower.fill, 'upper' = upper.fill))

  if (format.date)
    p = p + scale_x_date()

  p
}
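Usage would then look like this (my example, using the ggplot_forecast name assigned above):
library(forecast)
library(ggplot2)
fc <- forecast(auto.arima(AirPassengers), h = 24)
ggplot_forecast(fc)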

R Language - Sorting data into ranges; averaging; ignore outliers

I am analyzing data from a wind turbine; normally this is the sort of thing I would do in Excel, but the quantity of data requires something heavy-duty. I have never used R before, so I am just looking for some pointers.
The data consists of 2 columns WindSpeed and Power, so far I have arrived at importing the data from a CSV file and scatter-plotted the two against each other.
What I would like to do next is to sort the data into ranges; for example all data where WindSpeed is between x and y and then find the average of power generated for each range and graph the curve formed.
From this average I want recalculate the average based on data which falls within one of two standard deviations of the average (basically ignoring outliers).
Any pointers are appreciated.
For those who are interested, I am trying to create a graph similar to this. It's a pretty standard type of graph, but like I said the sheer quantity of data requires something heavier than Excel.
Since you're no longer in Excel, why not use a modern statistical methodology that doesn't require crude binning of the data and ad hoc methods to remove outliers: local regression, as implemented by loess.
Using a slight modification of csgillespie's sample data:
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
plot(w_sp, power)
x_grid <- seq(0, 100, length = 100)
lines(x_grid, predict(loess(power ~ w_sp), x_grid), col = "red", lwd = 3)
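As a hedged aside of mine (not part of the original answer), base R's scatter.smooth() combines the scatterplot and the loess line in one call:
scatter.smooth(w_sp, power, span = 2/3, col = "grey")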
Throw this version, similar in motivation to @hadley's, into the mix: an additive model with an adaptive smoother, using package mgcv.
Dummy data first, as used by @hadley:
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
df <- data.frame(power = power, w_sp = w_sp)
Fit the additive model using gam(), using an adaptive smoother and smoothness selection via REML
require(mgcv)
mod <- gam(power ~ s(w_sp, bs = "ad", k = 20), data = df, method = "REML")
summary(mod)
Predict from our model and get standard errors of the fit; use the latter to generate an approximate 95% confidence interval:
x_grid <- with(df, data.frame(w_sp = seq(min(w_sp), max(w_sp), length = 100)))
pred <- predict(mod, x_grid, se.fit = TRUE)
x_grid <- within(x_grid, fit <- pred$fit)
x_grid <- within(x_grid, upr <- fit + 2 * pred$se.fit)
x_grid <- within(x_grid, lwr <- fit - 2 * pred$se.fit)
Plot everything, including the loess fit for comparison:
plot(power ~ w_sp, data = df, col = "grey")
lines(fit ~ w_sp, data = x_grid, col = "red", lwd = 3)
## upper and lower confidence intervals ~95%
lines(upr ~ w_sp, data = x_grid, col = "red", lwd = 2, lty = "dashed")
lines(lwr ~ w_sp, data = x_grid, col = "red", lwd = 2, lty = "dashed")
## add loess fit from #hadley's answer
lines(x_grid$w_sp, predict(loess(power ~ w_sp, data = df), x_grid),
      col = "blue", lwd = 3)
First we will create some example data to make the problem concrete:
w_sp = sample(seq(0, 100, 0.01), 1000)
power = 1/(1+exp(-(rnorm(1000, mean=w_sp, sd=5) -40)/5))
Suppose we want to bin the wind-speed values into [0,5), [5,10), etc., and average the power within each bin. Then
bin_incr = 5
bins = seq(0, 95, bin_incr)
y_mean = sapply(bins, function(x) mean(power[w_sp >= x & w_sp < (x+bin_incr)]))
We have now created the mean values between the ranges of interest. Note: if you want the median values, just change mean to median. All that's left to do is to plot them:
plot(w_sp, power)
points(seq(2.5, 97.5, 5), y_mean, col=3, pch=16)
To get the average based on data that falls within two standard deviations of the average, we need a slightly more complicated function:
noOutliers <- function(x, power, w_sp, bin_incr) {
  d <- power[w_sp >= x & w_sp < (x + bin_incr)]
  m_d <- mean(d)
  # keep only the points within two standard deviations of the bin mean
  d_trim <- d[d > (m_d - 2 * sd(d)) & d < (m_d + 2 * sd(d))]
  mean(d_trim)
}
y_no_outliers <- sapply(bins, noOutliers, power, w_sp, bin_incr)
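To overlay these trimmed means on the scatterplot (my addition, following the same bin-midpoint convention as the earlier plot):
plot(w_sp, power)
points(bins + bin_incr / 2, y_no_outliers, col = 2, pch = 16)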
Here are some examples of fitted curves (Weibull analysis) for commercial turbines:
http://www.inl.gov/wind/software/
http://www.irec.cmerp.net/papers/WOE/Paper%20ID%20161.pdf
http://www.icaen.uiowa.edu/~ie_155/Lecture/Power_Curve.pdf
I'd recommend also playing around with Hadley's own ggplot2. His website is a great resource: http://had.co.nz/ggplot2/ .
# If you haven't already installed ggplot2:
install.packages("ggplot2", dependencies = T)
# Load the ggplot2 package
require(ggplot2)
# csgillespie's example data
w_sp <- sample(seq(0, 100, 0.01), 1000)
power <- 1/(1+exp(-(w_sp -40)/5)) + rnorm(1000, sd = 0.1)
# Bind the two variables into a data frame, which ggplot prefers
wind <- data.frame(w_sp = w_sp, power = power)
# Take a look at how the first few rows look, just for fun
head(wind)
# Create a simple plot
ggplot(data = wind, aes(x = w_sp, y = power)) + geom_point() + geom_smooth()
# Create a slightly more complicated plot as an example of how to fine-tune
# plots in ggplot
p1 <- ggplot(data = wind, aes(x = w_sp, y = power))
p2 <- p1 + geom_point(colour = "darkblue", size = 1, shape = 16) # shape 16 = solid circle
p3 <- p2 + geom_smooth(method = "loess", se = TRUE, colour = "purple")
p3 + scale_x_continuous(name = "mph") +
  scale_y_continuous(name = "power") +
  labs(title = "Wind speed and power") # opts() has since been removed from ggplot2; labs() sets the title
