Compute smoothed mean on time series - r

I have got a data.frame where one column represents dates in years and the other column observations of e.g. sea level in mm.
I need to calculate the 10-year smoothed mean.
Here some fake data:
# Fake data: one random annual sea level rise value per year, 1801-1900
date <- 1801:1900
x <- rnorm(length(date))
# Combine years and observations into a data.frame
df <- data.frame(date = date, x = x)
Is there any R function that could help?
Is the smoothed mean the same as the moving average?
Thanks for any help and/or suggestion

The moving average is just the simplest case of a smoothed mean, which is widely used in climate science. The R filter function (from the stats package) may be a convenient way to solve your problem:
# Sample data: 100 random annual values for the years 1801-1900
x <- rnorm(100)
date <- seq(1801, 1900)
df <- data.frame(date, x)

# Equal weights are the simplest coefficients: a plain 10-point moving average
f10 <- rep(1 / 10, 10)

# Call stats::filter() explicitly: once dplyr is attached, a bare filter()
# resolves to dplyr::filter(), which does not accept a numeric vector.
# sides = 1 averages the current value with the 9 before it, so the first
# 9 entries are NA.
df[, "x_10ma"] <- stats::filter(df$x, f10, sides = 1)

# Fast check: raw values in red, 10-year moving average in blue
plot(x = df$date, y = df$x, col = "red")
points(x = df$date, y = df$x_10ma, col = "blue")
More advanced smoothing options are provided, e.g. by the 'TTR' or 'smooth' packages.

Related

Moving average on several time series using ggplot

Hi I try desperately to plot several time series with a 12 months moving average.
Here is an example with two time series of flower and seeds densities. (I have much more time series to work on...)
# Example data: 36 monthly observations (Jan 2000 - Dec 2002) for two taxa
taxon <- rep(c("Flower", "Seeds"), each = 36)
density <- c(seq(20, 228, length.out = 36), seq(33, 259, length.out = 36))
year <- rep(rep(c("2000", "2001", "2002"), each = 12), times = 2)
# First day of every month, built with base as.Date() so this snippet does not
# need lubridate attached (the original called ymd() without loading it).
month_starts <- seq(as.Date("2000-01-01"), as.Date("2002-12-01"), by = "month")
ymd <- rep(month_starts, 2)
# Long-format data frame: one row per taxon and month
df <- data.frame(taxon, density, year, ymd)
library(forecast)
#create function that does a Symmetric Weighted Moving Average (2x12) of the monthly log density of flowers and seeds
# Symmetric weighted (2x12) moving average of log(x + 1) for a monthly series.
#
# x:     numeric vector of monthly observations, in time order.
# start: start of the series as c(year, month); defaults to January 2000,
#        the value that was previously hard-coded.
# Returns whatever ma() returns for the log-transformed monthly ts object.
ma_12 <- function(x, start = c(2000, 1)) {
  # ma() is run on a ts object, so convert first. No `end` is passed: the
  # series length now follows the data instead of being forced onto a fixed
  # Jan 2000 - Dec 2002 window.
  ts_x <- ts(x, start = start, frequency = 12)
  ma(log(ts_x + 1), order = 12, centre = TRUE)
}
# Trial run of ma_12() on the Flower densities only
ma_12(df[df$taxon=="Flower",]$density) # works well
library(ggplot2)
# Attempt to plot the Flower and Seeds log densities as two smoothed time
# series, one line per taxon, by handing ma_12 to stat_summary().
# NOTE(review): stat_summary() presumably applies ma_12 to the y values found
# at each x position (here `year`) rather than to one whole series per taxon --
# confirm against the ggplot2 documentation.
ggplot(df,aes(x=year,y=density,colour=factor(taxon),group=factor(taxon))) +
stat_summary(fun.y = ma_12, geom = "line") # or geom = "smooth"
# Output produced by the call above:
# Warning message:
# Computation failed in `stat_summary()`:
# invalid time series parameters specified
Function ma_12 works correctly. The problem comes when I try to plot both time-series (Flower and Seed) using ggplot. I cannot define both taxa as different time series and apply a moving average on them. Seems that it has to do with "stat_summary"...
Any help would be more than welcome! Thanks in advance
Note: The following link is quite useful but can not directly help me as I want to apply a specific function and plot it in accordance to the levels of one group variable. For now, I can't find any solution. Any way, thank you to suggest me this.
Multiple time series in one plot
This is what you need?
library(lubridate)
library(ggplot2)

# Smooth each taxon separately; ma_12() works on log(density + 1)
f <- ma_12(df[df$taxon == "Flower", ]$density)
s <- ma_12(df[df$taxon == "Seeds", ]$density)

# Pair every smoothed value with its time stamp. Columns are named explicitly
# (and identically for both taxa) so the code below does not depend on
# auto-generated column names being partially matched by `$`.
f <- cbind(f = f, time = time(f))
s <- cbind(f = s, time = time(s))
serie <- data.frame(
  rbind(f, s),
  taxon = c(rep("Flower", dim(f)[1]), rep("Seeds", dim(s)[1]))
)

# Undo the log(x + 1) transform applied inside ma_12(): its inverse is
# exp(y) - 1, i.e. expm1(y), not exp(y).
serie$density <- expm1(serie$f)

# Convert the decimal-year time stamps back to calendar dates so the smoothed
# lines share an x axis with the raw observations.
serie$time <- ymd(format(date_decimal(serie$time), "%Y-%m-%d"))

# Raw observations as points, smoothed series as lines, coloured by taxon
ggplot() +
  geom_point(
    data = df,
    aes(x = ymd, y = density, color = taxon, group = taxon)
  ) +
  geom_line(
    data = serie,
    aes(x = time, y = density, color = taxon, group = taxon)
  )

Plotting confidence intervals in ggplot (from a matrix)

I have simulated the enviromental damage on two economic sectors of a small region using basic input-output analysis.
I have plotted the average damage and its confidence interval by sector each year (I have a 10-year period) as reported in the code below, using matrix.
I would like to get a similar graph using ggplot.
For now, I have decided to omit the code corresponding to the data set up and the simulations to make the question as concise as possible, please, let me know if I should include it.
Thank you in advance for your help
# NOTE(review): vX is indexed as vX[, , sec] and summarised over its second
# margin, so it is presumably a simulations x periods x sectors array, with N
# the number of periods -- confirm against the omitted set-up code.
qs <- c(0.025, 0.975)

# Average drop in each iteration (rows) for each sector (columns)
medias <- t(apply(vX, 2, function(x) apply(x, 2, mean)))
# Standard deviations, same layout
devtip <- t(apply(vX, 2, function(x) apply(x, 2, sd)))
devtip
# 2.5% / 97.5% quantile intervals
inter95 <- t(apply(vX, 2, function(x) apply(x, 2, quantile, probs = qs)))
# The first two columns are the interval for the first sector,
# the second two columns are the interval for the second sector.
inter95[, 1:2] # interval for the first sector
inter95[, 3:4] # interval for the second sector

# Plot the mean drop in demand for each sector each year, with its interval
matplot(
  medias,
  type = "l", lty = 1, lwd = 1,
  ylab = "Variación de la producción en Media",
  xlab = "Tiempo (Iteracción)",
  ylim = range(inter95)
)
# One vertical segment per period and sector, in the colour matplot() gave
# that sector's line
for (sec in seq_along(sec.int)) {
  inter <- apply(vX[, , sec], 2, quantile, probs = qs)
  segments(x0 = seq_len(N), y0 = inter[1, ], y1 = inter[2, ], col = sec)
}
legend(
  "right", paste("Sec.", sec.int),
  col = seq_along(sec.int), bty = "n", lty = 1
)
Firstly, please provide some reproducible data. Also, I think your question has already been answered here.
Assuming that in your example medias is a matrix with ncol= 2 for the both trend means and inter95 another matrix with ncol= 4, saving the confidence intervals, I would do:
library(ggplot2)

# Combine the two mean series with their lower/upper bounds
# (assumes medias has 2 columns and inter95 has 4, ordered lwr/upr per sector)
df <- cbind.data.frame(medias, inter95)
names(df) <- c("mean1", "mean2", "lwr1", "upr1", "lwr2", "upr2")
# Take the time index from the data itself: `n` is not defined at this point
# in the answer, and nrow(df) always matches the rows actually present.
df$time <- seq_len(nrow(df))

# First sector in the default colour, second sector in red; each ribbon is
# the interval around the corresponding mean line.
ggplot(df, aes(time, mean1)) +
  geom_line() +
  geom_ribbon(aes(ymin = lwr1, ymax = upr1), alpha = 0.3) +
  geom_line(aes(time, mean2), col = "red") +
  geom_ribbon(aes(ymin = lwr2, ymax = upr2), alpha = 0.3, fill = "red")
Using this data
# Reproducible toy data: n periods, two mean series, and a band of
# half-width b around each mean
set.seed(1)
n <- 10
b <- 0.5
medias <- matrix(rnorm(2 * n), nrow = n)
inter95 <- cbind(
  medias[, 1] - b, medias[, 1] + b,
  medias[, 2] - b, medias[, 2] + b
)
gives you the following plot
plot

starting a daily time series in R

I have a daily time series of the number of visitors to a web site. My series starts on 01/06/2014 and runs until today, 14/10/2015, and I wish to predict the number of visitors in the future. How can I read my series into R? I'm thinking of:
series <- ts(visitors, frequency=365, start=c(2014, 6))
If yes, and after running my time series model arimadata=auto.arima(), I want to predict the number of visitors for the next 60 days. How can I do this?
h=..?
forecast(arimadata,h=..),
What should the value of h be?
thanks in advance for your help
The ts specification is wrong; if you are setting this up as daily observations, then you need to specify what day of the year 2014 is June 1st and specify this in start:
## Daily sequence of dates covering the observation period
first_day <- as.Date("2014-06-01")
last_day <- as.Date("2015-10-14")
inds <- seq(first_day, last_day, by = "day")
## Random stand-in data, one value per day
set.seed(25)
values <- rnorm(length(inds))
## Daily time series: the second element of `start` is the day of the year
## ("%j") of the first observation
start_doy <- as.numeric(format(first_day, "%j"))
myts <- ts(values, start = c(2014, start_doy), frequency = 365)
Note that I specify start as c(2014, as.numeric(format(inds[1], "%j"))). All the complicated bit is doing is working out what day of the year June 1st is:
> as.numeric(format(inds[1], "%j"))
[1] 152
Once you have this, you're effectively there:
## auto.arima() and forecast() come from the forecast package
library(forecast)
## use auto.arima to choose ARIMA terms
fit <- auto.arima(myts)
## forecast for next 60 time points
fore <- forecast(fit, h = 60)
## plot it
plot(fore)
That seems suitable given the random data I supplied...
You'll need to select appropriate arguments for auto.arima() as suits your data.
Note that the x-axis labels refer to 0.5 (half) of a year.
Doing this via zoo
This might be easier to do via a zoo object created using the zoo package:
## zoo() comes from the zoo package
library(zoo)
## create the same random series as a zoo object indexed by the dates in inds
set.seed(25)
myzoo <- zoo(rnorm(length(inds)), inds)
Note you now don't need to specify any start or frequency info; just use inds computed earlier from the daily Date object.
Proceed as before
## use auto.arima to choose ARIMA terms, this time on the zoo series
## (fitting `myts` again would leave `myzoo` unused)
fit <- auto.arima(myzoo)
## forecast for next 60 time points
fore <- forecast(fit, h = 60)
The plot though will cause an issue as the x-axis is in days since the epoch (1970-01-01), so we need to suppress the auto plotting of this axis and then draw our own. This is easy as we have inds
## Plot the forecast without the default x-axis, which would be labelled in
## days since 1970-01-01
plot(fore, xaxt = "n") # no x-axis
## Draw the axis from the Date vector `inds` instead
Axis(inds, side = 1)
This only produces a couple of labeled ticks; if you want more control, tell R where you want the ticks and labels:
## Tick positions: every 3 months from the first observation up to 60 days
## past the last one
tick_dates <- seq(inds[1], tail(inds, 1) + 60, by = "3 months")
## Plot without the default x-axis, then add the labelled date axis
plot(fore, xaxt = "n") # no x-axis
Axis(inds, side = 1, at = tick_dates, format = "%b %Y")
Here we plot every 3 months.
Time Series Object does not work well with creating daily time series. I will suggest you use the zoo library.
library(zoo)
zoo(visitors, seq(from = as.Date("2014-06-01"), to = as.Date("2015-10-14"), by = 1))
Here's how I created a time series when I was given some daily observations with quite a few observations missing. #gavin-simpson gave quite a big help. Hopefully this saves someone some grief.
The original data looked something like this:
library(lubridate)
set.seed(42)
minday <- as.Date("2001-01-01")
maxday <- as.Date("2005-12-31")
dates <- seq(minday, maxday, by = "days")
# Create some holes: keep a random quarter of the days. Integer division
# gives sample() a whole-number size.
dates <- dates[sample(seq_along(dates), length(dates) %/% 4)]
# One sine cycle spread over the retained dates, sorted chronologically
df <- data.frame(
  date = sort(dates),
  val = sin(seq(from = 0, to = 2 * pi, length.out = length(dates)))
)
To create a time-series with this data I created a 'dummy' dataframe with one row per date and merged that with the existing dataframe:
# Outer join against a complete daily calendar: dates with no observation
# get a row whose val is NA.
full_calendar <- data.frame(date = seq(minday, maxday, by = "days"))
df <- merge(df, full_calendar, all = TRUE)
This dataframe can be cast into a timeseries. Missing dates are NA.
# Start of the series as c(year, day of year) of the first calendar day
first_year <- year(minday)
first_doy <- as.numeric(format(minday, "%j"))
nts <- ts(df$val, start = c(first_year, first_doy), frequency = 365)
plot(nts)
series <- ts(visitors, frequency=365, start=c(2014, 152))
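Here 152 is the day of the year of 01-06-2014 (1 June is the 152nd day of 2014); with frequency=365 the second element of start is the position within the year, so the series starts at day 152.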
To forecast for 60 days, h=60.
forecast(arimadata , h=60)

generate difference between 2 survdiff objects

Is it possible to substract one survdiff object from another one in R, using the survival package?
I want to plot a figure that shows in which intervals one survival curve is higher/lower than the other and by how much.
one possible solution with survA and survB as survdiff-objects:
# Time points at which both curves are evaluated;
# choose a different time interval if you want
interval <- 0:2500
# NOTE(review): summary(..., times = )$surv is what fitted survival curves
# (survfit objects) provide -- confirm that survA / survB are survfit results.
# extend = TRUE keeps one value per requested time even past the last
# follow-up, so both $surv vectors have the same length as `interval`.
sumA <- summary(survA, times = interval, extend = TRUE)
sumB <- summary(survB, times = interval, extend = TRUE)
both <- data.frame(time = interval, A = sumA$surv, B = sumB$surv)
# Positive values: curve B lies above curve A at that time
both$diff <- both$B - both$A
# or both$diff <- both$A - both$B
plot(x = both$time, y = both$diff, type = "l")

Convert absolute values to ranges for charting in R

Warning: still new to R.
I'm trying to construct some charts (specifically, a bubble chart) in R that shows political donations to a campaign. The idea is that the x-axis will show the amount of contributions, the y-axis the number of contributions, and the area of the circles the total amount contributed at this level.
The data looks like this:
CTRIB_NAML CTRIB_NAMF CTRIB_AMT FILER_ID
John Smith $49 123456789
The FILER_ID field is used to filter the data for a particular candidate.
I've used the following functions to convert this data frame into a bubble chart (thanks to help here and here).
# One bubble per distinct contribution amount:
# x = amount, y = number of contributions of that amount, circle = their total
amt <- dfr$CTRIB_AMT
vals <- sort(unique(amt))
counts <- tapply(amt, amt, length)
sums <- tapply(amt, amt, sum)
symbols(
  vals, counts,
  circles = sums, fg = "white", bg = "red",
  xlab = "Amount of Contribution",
  ylab = "Number of Contributions"
)
# Print each total on top of its bubble
text(vals, counts, sums, cex = 0.75)
However, this results in way too many intervals on the x-axis. There are several million records all told, and divided up for some candidates could still result in an overwhelming amount of data. How can I convert the absolute contributions into ranges? For instance, how can I group the vals into ranges, e.g., 0-10, 11-20, 21-30, etc.?
----EDIT----
Following comments, I can convert vals to numeric and then slice into intervals, but I'm not sure then how I combine that back into the bubble chart syntax.
new_vals <- as.numeric(as.character(sub("\\$","",vals)))
new_vals <- cut(new_vals,100)
But regraphing:
symbols(new_vals,counts, circles=sums)
Is nonsensical -- all the values line up at zero on the x-axis.
Now that you've binned vals into a factor with cut, you can just use tapply again to find the counts and the sums using these new breaks. For example:
# Bin every contribution, not just the unique amounts: tapply() needs the
# grouping factor to be exactly as long as the data it summarises.
# The amounts are stored as "$"-prefixed text, so convert them first.
amounts <- as.numeric(sub("\\$", "", as.character(dfr$CTRIB_AMT)))
amount_bins <- cut(amounts, 100)
counts <- tapply(amounts, amount_bins, length)
sums <- tapply(amounts, amount_bins, sum)
For this type of thing, though, you might find the plyr and ggplot2 packages helpful. Here is a complete reproducible example:
library(ggplot2)
library(plyr) # provides ddply()

# Options
n <- 1000
breaks <- 10

# Generate data
set.seed(12345)
CTRIB_NAML <- replicate(n, paste(letters[sample(10)], collapse = ""))
CTRIB_NAMF <- replicate(n, paste(letters[sample(10)], collapse = ""))
CTRIB_AMT <- paste0("$", round(runif(n, 0, 100), 2))
# Ten random 9-digit ids, assigned at random to the n rows
FILER_ID <- replicate(
  10, paste(as.character((0:9)[sample(9)]), collapse = "")
)[sample(10, n, replace = TRUE)]
dfr <- data.frame(CTRIB_NAML, CTRIB_NAMF, CTRIB_AMT, FILER_ID)

# Format data: strip the "$" so amounts are numeric, then bin them
dfr$CTRIB_AMT <- as.numeric(sub("\\$", "", dfr$CTRIB_AMT))
dfr$CTRIB_AMT_cut <- cut(dfr$CTRIB_AMT, breaks)

# Summarize data for plotting: count and total contributed per bin
plot_data <- ddply(
  dfr, "CTRIB_AMT_cut",
  function(x) data.frame(count = nrow(x), total = sum(x$CTRIB_AMT))
)

# Make plot. theme()/element_text() replace the opts()/theme_text() calls
# that current ggplot2 releases no longer provide.
dev.new(width = 4, height = 4)
ggplot(plot_data, aes(x = CTRIB_AMT_cut, y = count, size = total)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Resources