Predict out of sample using flexsurvreg in R

I have the following model in R
library(flexsurv)
data(ovarian)
model = flexsurvreg(Surv(futime, fustat) ~ ecog.ps + rx, data = ovarian, dist='weibull')
model
predict(model, data = ovarian, type = 'response')
The model summary looks like this: [flexsurvreg model output]
I am trying to predict the survival time using the predict function in R and get the following error: [error while trying to predict]
How can I predict expected lifetime using this flexsurvreg model?
I understand that the documentation mentions a totlos.fs function, but this data does not seem to have a trans variable that totlos.fs requires to provide an output.
If there is no alternative to totlos.fs, how can I create a trans variable in this data and handle it along with the existing covariates?
Please advise.

Section 3 of the supplementary examples document for the flexsurv package has an example in which the predicted values are calculated directly from the model equation. As you are using the Weibull distribution (with n = 2 baseline parameters), I believe this should work:
n <- 2 # number of baseline Weibull parameters (shape and scale)
pred.model <- model.matrix(model) %*% model$res[-(1:n), "est"]
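As an aside, recent flexsurv releases also ship a predict() method; here is a minimal sketch, assuming your installed version has it (note the argument is newdata, not data, which may be what caused the original error):
# hedged: requires a flexsurv version that provides predict.flexsurvreg;
# type = "response" returns the predicted mean survival time
predict(model, newdata = ovarian, type = "response")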
Cheers

Nik,
I know your question is an old one, but see below for how I hacked a way to do it. It involves retrieving the shape and rate parameters from your fitted model, then, instead of predict, using qgompertz() from flexsurv. Please excuse the use of my own encapsulated example code, but you should be able to follow along.
# generate the training data "lung1" from data(lung) in survival package
# hacked way for truncating the lung data to 2 years of follow up
require(survival)
lung$yrs <- lung$time/365
lung1 <- lung[c("status", "yrs")]
lung1$status[lung1$yrs > 2] <- 1 # censor events after 2 years (status 1 = censored in survival::lung)
lung1$yrs[lung1$yrs > 2] <- 2 # truncate follow-up time at 2 years
# from the training data build KM to obtain survival %s
s <- Surv(time=lung1$yrs, event=lung1$status)
km.lung <- survfit(s ~ 1, data=lung1)
plot(km.lung)
# generate dataframe to use later for plotting
cut.length <- sum((km.lung$time <= 2)) # so I can create example test data
test.data <- data.frame(yrs = km.lung$time[1:cut.length],
                        surv = round(km.lung$surv[1:cut.length], 3))
##
## doing the same as above with gompertz
##
require(flexsurv) #needed to run gompertz model
s <- Surv(time=lung1$yrs, event=lung1$status)
gomp <- flexsurvreg(s ~ 1, data=lung1, dist="gompertz") # run this to get shape and rate estimates for gompertz
gomp # notice the shape and rate values
# create variables for these values
g.shape <- 0.5866
g.rate <- 0.5816
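# (rather than hard-coding, these could also be pulled from the fit object;
#  flexsurvreg stores the natural-scale estimates in its $res matrix)
# g.shape <- gomp$res["shape", "est"]
# g.rate  <- gomp$res["rate", "est"]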
##
## plot data and visualize the Gompertz fit
##
# vars for plotting
df1 <- test.data
xvar <- "yrs"
yvar <- "surv"
extendedtime <- 3 # extend the x-axis past the 2-year training cutoff
ylim1 <- c(0,1)
xlim1 <- c(0, extendedtime)
# plot the survival % for training data
plot(df1[,yvar]~df1[,xvar], type="S", ylab="", xlab="", lwd=3, xlim=xlim1, ylim=ylim1)
# Nik, here is where the magic happens... pay special attention to qgompertz():
# its q-th quantile is the time by which a fraction q has died, so plotting
# those times against the survival fractions 1-q traces the fitted curve
lines(qgompertz(seq(.01,.99,by=.01), shape=g.shape, rate=g.rate), seq(.99,.01,by=-.01), col="red", lwd=2, lty=2)
# generate a km curve from the testing data
s <- Surv(time=lung$yrs, event=lung$status)
km.lung <- survfit(s ~ 1, data=lung)
par(new=T)
# now draw remaining survival curve from the testing section
plot(km.lung$surv[(cut.length+1):length(km.lung$time)]~km.lung$time[(cut.length+1):length(km.lung$time)], type="S", col="blue", ylab="", xlab="", lwd=3, xlim=xlim1, ylim=ylim1)

ARIMA loop in R

I'm pretty new to R and I've run into a problem with finding the optimal ARIMA model. So far I've modeled the trend and a seasonal component, and now I want to model the cyclical component with an ARIMA model. I want the output in the end to include coefficients for the time variable, the seasonal variables and also the ARIMA variables. I've tried to use a loop to find the optimal ARIMA model and the coefficients, but I just get this message:
"Error in optim(init[mask], armaCSS, method = optim.method, hessian = FALSE, :
non-finite value supplied by optim"
I've tried looking for other answers in here, but I just can't seem to figure out what I'm doing wrong.
I've included the entire code in case it is necessary, but the error appears after running the loop in the end.
I appreciate any help I can get, thank you!
#clear workspace
rm(list=ls())
#load data
setwd("~/Desktop/CBS/HA almen year 3 /Forecasting /R koder ")
data <- scan("onlineretail.txt")
data <- data[2:69] #cut off first period + two last periods for whole years
T=length(data)
s=4
years=T/s
styear=2000
st=c(styear,1)
data = ts(data,start=st, frequency = s)
plot(data)
summary(data)
#plot shows increasing variance - log transform data
lndata <- log(data)
plot(lndata)
dataTSE = decompose(lndata, type="additive")
plot(dataTSE)
########### Trend ##########
t=(1:T)
t2=t^2
lny <- lndata
lmtrend.model <- lm(lny~t)
summary(lmtrend.model)
#linear trend T_t = 8.97 + 0.039533*TIME - both coefficients significant
#Project 2, explanation why linear is better than quadratic
qtrend.model <- lm(lny~t+t2)
summary(qtrend.model)
lntrend = fitted(lmtrend.model)
lntrend = ts(lntrend, start=st, frequency = s)
#lntrend2 = fitted(qtrend.model)
#lntrend2 = ts(lntrend2, start=st, frequency = s)
residuals=lny-lntrend
par(mar=c(5,5,5,5))
plot(lny, ylim=c(5,12), main="Log e-commerce retail sales")
lines(lntrend, col="blue")
#lines(lntrend2, col="red")
par(new=T)
plot(residuals,ylim=c(-0.2,0.8),ylab="", axes=F)
axis(4, pretty(c(-0.2,0.4)))
abline(h=0, col="grey")
mtext("Residuals", side=4, line=2.5, at=0)
############# Season #################
#The ACF of the residuals confirms the neglected seasonality, because there
#is a clear pattern for every k+4 lags:
acf(residuals)
#Remove trend to observe seasonal factors without the trend:
detrended = residuals
plot(detrended, ylab="ln sales", main="Seasonality in ecommerce retail sales")
abline(h=0, col="grey")
#We can check out the average magnitude of seasonal factors
seasonal.matrix=matrix(detrended, ncol=s, byrow=TRUE) # byrow takes TRUE/FALSE, not the number of years
SeasonalFactor = apply(seasonal.matrix, 2, mean)
SeasonalFactor=ts(SeasonalFactor, frequency = s)
SeasonalFactor
plot(SeasonalFactor);abline(h=0, col="grey")
#We add seasonal dummies to our model of trend and omit the last quarter
library("forecast")
M <- seasonaldummy(lny)
ST.model <- lm(lny ~ t+M)
summary(ST.model)
#ST.model <- tslm(lny~t+season)
#summary(ST.model)
#Both the trend and seasonal dummies appear highly significant
#We will use a Durbin-Watson test to detect serial correlation
library("lmtest")
dwtest(ST.model)
#The DW value is 0.076396. This is quite small, as the value should be
#around 2, and we should therefore try to improve the model with a
#cyclical component
#I will construct a plot that shows how the model fits the data and
#how the residuals look
lntrend=fitted(ST.model)
lntrend = ts(lntrend, start=st, frequency = s)
residuals=lny-lntrend
par(mar=c(5,5,5,5))
plot(lny, ylim=c(5,12), main="Log e-commerce retail sales")
lines(lntrend, col="blue")
#tell R to draw over the current plot with a new one
par(new=T)
plot(residuals,ylim=c(-0.2,0.8),ylab="", axes=F)
axis(4, pretty(c(-0.2,0.4)))
abline(h=0, col="grey")
mtext("Residuals", side=4, line=2.5, at=0)
############## Test for unit root ############
#We will check if the data is stationary, and to do so we will
#test for unit root.
#To do so, we will perform a Dickey-Fuller test. First, we have to remove
#the seasonal component.
#We can also perform an informal test with ACF and PACF
#the autocorrelation function shows that the data damps slowly
#while the PACF is close to 1 at lag 1 and then lags become insignificant
#this is informal evidence of unit root
acf(residuals)
pacf(residuals)
#Detrended and deseasonalized data
deseason = residuals
plot(deseason)
#level changes a lot over time, not stationary in mean
#Dickey-Fuller test
require(urca)
test <- ur.df(deseason, type = c("trend"), lags=3, selectlags = "AIC")
summary(test)
#We do not reject that there is a unit root if
# |test statistics| < |critical value|
# 1.97 < 4.04
#We can see from the output that the absolute value of the test statistic
#is smaller than the critical value. Therefore, there is no evidence against
#the unit root.
#We check the ACF and PACF in first differences. There should be no
#significant lags if the data is white noise in first differences.
acf(diff(deseason))
pacf(diff(deseason))
deseasondiff = diff(deseason, differences = 2)
plot(deseasondiff)
test2 <- ur.df(deseasondiff, type=c("trend"), lags = 3, selectlags = "AIC")
summary(test2)
#From the plot and the Dickey-Fuller test, it looks like we need to
#difference twice
############# ARIMA model ############
S1 = rep(c(1,0,0,0), T/s)
S2 = rep(c(0,1,0,0), T/s)
S3 = rep(c(0,0,1,0), T/s)
TrSeas = model.matrix(~ t+S1+S2+S3)
#Double loop for finding the best-fitting ARIMA model; since there was
#a drift, we include this in the model
best.order <- c(0, 2, 0)
best.aic <- Inf
for (q in 1:6) for (p in 1:6) {
  fit.aic <- AIC(arima(lny, order = c(p, 2, q), include.mean = TRUE, xreg = TrSeas))
  print(c(p, q, fit.aic))
  if (fit.aic < best.aic) {
    best.order <- c(p, 2, q) # match the d = 2 actually used in the fit
    best.arma <- arima(lny, order = c(p, 2, q), include.mean = TRUE, xreg = TrSeas)
    best.aic <- fit.aic
  }
}
best.order
Please use the forecast package from Prof. Hyndman.
The call to:
auto.arima(data)
will return the optimal ARIMA model for your time series. You will find https://www.otexts.org/fpp/8/7 a great reference as well.
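Since you also want the coefficients for the time and seasonal variables in the output, a minimal sketch (untested; it reuses the lny and TrSeas objects from your script) is to hand the dummies to auto.arima() as external regressors:
library(forecast)
# let auto.arima() choose (p,d,q); the trend and seasonal dummies enter as
# external regressors, so their coefficients appear in the fitted model
# (the intercept column of TrSeas is dropped to avoid a redundant constant)
fit <- auto.arima(lny, xreg = TrSeas[, -1])
summary(fit)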

Add raw data points to jp.int (sjPlot)

For my manuscript, I plotted an lme with an interaction of two continuous variables:
Create the data:
mydata <- data.frame(SID = sample(1:150, 400, replace = TRUE),
                     age = sample(50:70, 400, replace = TRUE),
                     sex = sample(c("Male", "Female"), 200, replace = TRUE),
                     time = seq(0.7, 6.2, length.out = 400),
                     Vol = rnorm(400), HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
Run the model:
library(nlme)
model <- lme(HCD ~ age*time + sex*time + Vol*time, random = ~time|SID, data = mydata)
Make plot:
library(sjPlot)
sjp.int(model, swap.pred = TRUE, show.ci = TRUE, mdrt.values = "meansd")
The reviewer now wants me to add the raw data points to this plot. How can I do this? I tried adding geom_point() referring to mydata, but that is not possible.
Any ideas?
Update:
I thought that maybe I could extract the random slopes of HCD, residualize HCD for the covariates, residualize Vol for the covariates, and then plot those two against each other to make things easier (then I could plot the points in a 2D plot).
So I tried to extract the slopes and use them to fit a linear regression, but the results are different (in the reproducible example merely less significant, but in my data the interaction became non-significant, whereas it was significant in the lme). I am not sure what that means, or whether it just shows that I should not try to plot it this way.
get the slopes:
model <- lme(HCD ~ time, random=~time|SID, data=mydata)
slopes <- rbind(row.names(model$coefficients$random$SID), model$coef$random$SID[,2])
slopes2 <- data.frame(matrix(unlist(slopes), nrow=144, byrow=T))
names(slopes2)[1] <- "SID"
names(slopes2)[2] <- "slopes"
(save slopes2 to a file and reopen it, because somehow R sees it as a factor)
Then create a cross-sectional dataframe and merge the slopes:
mydata$time2 <- round(mydata$time)
new <- reshape(mydata,idvar = "SID", timevar="time2", direction="wide")
newdata <- dplyr::left_join(new, slopes2, by="SID")
The lm:
modelw <- lm(slopes ~ age.1 + sex.1 + Vol.1, data = newdata)
Vol now has a p-value of 0.8 (previously this was 0.14)
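For what it's worth, one hedged direction (an untested sketch): sjPlot builds ggplot objects, and many of its functions return them invisibly, so capturing the return value and adding a point layer to it may work.
library(ggplot2)
# assumption: this sjPlot version exposes its plots in a $plot.list element;
# inspect str() of the return value to find the right slot in your version
p <- sjp.int(model, swap.pred = TRUE, show.ci = TRUE, mdrt.values = "meansd")
p$plot.list[[1]] +
  geom_point(data = mydata, aes(x = time, y = HCD),
             alpha = 0.3, inherit.aes = FALSE)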

Predict Future values using polynomial regression in R

I was trying to predict the future values of a sample using polynomial regression in R. The y values within the sample form a wave pattern.
For example
x = 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
y= 1,2,3,4,5,4,3,2,1,0,1,2,3,4,5,4
But when the graph was plotted for future values, the resulting y values were completely different from what was expected. Instead of a wave pattern, I was getting a graph where the y values keep increasing.
futurY = 17,18,19,20,21,22
I tried different degrees of polynomial regression, but the predicted results for futurY were drastically different from what was expected.
Following is the sample R code which was used to get the results
dfram <- data.frame('x'=c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16))
dfram$y <- c(1,2,3,4,5,4,3,2,1,0,1,2,3,4,5,4)
plot(dfram$x, dfram$y, type="l", lwd=3)
pred <- data.frame('x'=c(17,18,19,20,21,22))
myFit <- lm(y ~ poly(x,5), data=dfram)
newdata <- predict(myFit, pred)
print(newdata)
plot(pred[,1],data.frame(newdata)[,1],type="l",col="red", lwd=3)
Is this the correct technique to be used for predicting the unknown future y values OR should I be using other techniques like forecasting?
# Reproducing your data frame
dfram <- data.frame("x" = c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16),
"y" = c(1,2,3,4,5,4,3,2,1,0,1,2,3,4,5,4))
From your graph I've got the phase and period of the signal. There are better ways of calculating these automatically.
# Phase and period
fase = 1
per = 10
In the linear model function I've put the triangular signal equations.
fit <- lm(y ~ I((((trunc((x-fase)/(per/2))%%2)*2)-1) * ((x-fase)%%(per/2)))
            + I((((trunc((x-fase)/(per/2))%%2)*2)-1) * ((per/2)-((x-fase)%%(per/2)))),
          data = dfram)
# Predict the old data
p_olddata <- predict(fit,type="response")
# Predict the new data
newdata <- data.frame('x'=c(17,18,19,20,21,22))
p_newdata <- predict(fit,newdata,type="response")
# Plotting old and new data
plot(x = c(dfram$x, newdata$x),
     y = c(p_olddata, p_newdata),
     col = c(rep("blue", length(p_olddata)), rep("green", length(p_newdata))),
     xlab = "x",
     ylab = "y")
lines(dfram)
Where the black line is the original signal, the blue circles are the prediction for the original points and the green circles are the prediction for the new data.
The graph shows a perfect fit for the model because there is no noise in the data. A real dataset will contain noise, so the fit will not look quite as nice.
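If the exact waveform is not known in advance, a hedged alternative (a sketch assuming only that the period is roughly 10, as above) is to fit sine and cosine terms instead of the triangular-wave equations:
# harmonic regression: one sine/cosine pair at the assumed period of 10
per <- 10
fit2 <- lm(y ~ sin(2*pi*x/per) + cos(2*pi*x/per), data = dfram)
predict(fit2, newdata = data.frame(x = 17:22))
A single harmonic gives a smooth approximation of the triangle wave; adding higher harmonics (e.g. sin(4*pi*x/per)) would sharpen the peaks.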

How to compare ARIMA model in R to actual observations used to create the model?

I've been using the R forecast package's auto.arima() function to fit an ARIMA model to my time series data. I want to see how good of a fit the ARIMA model is to my original data. I hope to plot my original time series and the ARIMA simulation on the same plot and see how well they match up. How can I do this?
I'm not really sure what you are looking for, but I think you want some indication of the accuracy of your model.
You may find details on the topic in the online book by Hyndman and Athanasopoulos: https://www.otexts.org/fpp/2/5
Here is an example based on the dataset "AirPassengers":
library(forecast)
data(AirPassengers)
# inspect the series
AirPassengers
plot(AirPassengers)
# data are assigned to a convenient vector
series <- AirPassengers
# the accuracy of forecasts can only be determined
# by considering how well a model performs on new data that were not used when fitting the model.
# The size of the test set is typically about 20% of the total sample
# training set
# use data from 1949 to 1956 for forecasting
sr = window(series, start=1949, end=c(1956,12))
# test set
# use remaining data from 1957 to 1960 to test accuracy
ser = window(series, start=1957, end=c(1960,12))
######################################################################
# plot training set
######################################################################
plot(sr, main="AirPassengers", ylab="", xlab="Months")
# plot forecasting for 5 years according to four methods
lines(meanf(sr,h=48)$mean, col=4)
lines(rwf(sr,h=48)$mean, col=2)
lines(rwf(sr,drift=TRUE,h=48)$mean, col=3)
lines(snaive(sr,h=48)$mean, col=5)
# legend
legend("topleft", lty=1, col=c(4,2,3, 5),
legend=c("Mean method","Naive method","Drift method", "Seasonal naïve method"),bty="n")
# the test set
lines(ser, col="red")
# accuracy for forecasting of sr (forecasted data) on ser (original data)
# the best model has the lowest error (particularly the MAPE, mean absolute percentage error)
# Mean method
accuracy(meanf(sr,h=48), ser)
# Naive method
accuracy(rwf(sr,h=48), ser)
# Drift method
accuracy(rwf(sr,drift=TRUE,h=48), ser)
# Seasonal naïve method
accuracy(snaive(sr,h=48), ser)
######################################################################
# plot test set only with the predictions
######################################################################
# calculate the forecasting
sr.mean <- meanf(sr,h=48)$mean
sr.naive <- rwf(sr,h=48)$mean
sr.drift <- rwf(sr,drift=TRUE,h=48)$mean
sr.seas <- snaive(sr,h=48)$mean
# plot the test set
plot(ser, main="AirPassengers", ylab="", xlab="Months", ylim = c(200,600))
# plot forecasting for 4 years according to four methods
lines(sr.mean, col=4)
lines(sr.naive, col=2)
lines(sr.drift, col=3)
lines(sr.seas, col=5)
# legend
legend("topleft", lty=1, col=c(4,2,3,5),
legend=c("Mean method","Naive method","Drift method", "Seasonal naïve method"),bty="n")
########################################################################
# for ARIMA, Hyndman suggests using auto.arima without stepwise selection
########################################################################
library(fpp)
trainData <- sr
testData <- ser
# the default value in auto.arima() is test="kpss".
# A KPSS test has a null hypothesis of stationarity
# In general, all the defaults are set to the values that give the best forecasts on average.
# CAUTION! Takes a while to compute
arimaMod <- auto.arima(trainData, stepwise=FALSE, approximation=FALSE)
arimaMod.Fr <-forecast(arimaMod,h=48)
# plot of the prediction and of the test set
plot(arimaMod.Fr)
lines(testData, col="red")
legend("topleft",lty=1,bty = "n",col=c("red","blue"),c("testData","ARIMAPred"))
# plot of the test set and its prediction only
AR.mean <-forecast(arimaMod,h=48)$mean
plot(testData, main="AirPassengers", ylab="", xlab="Months", col="darkblue")
lines(AR.mean, col="red")
# accuracy
accuracy(arimaMod.Fr,testData)
# test residues of arima
tsdisplay(residuals(arimaMod))
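If your version of the forecast package is recent enough, checkresiduals() should bundle the residual plots with a Ljung-Box test in a single call:
# hedged: checkresiduals() exists in newer forecast releases
checkresiduals(arimaMod)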

How to add RMSE, slope, intercept, r^2 to R plot?

How can I add RMSE, slope, intercept and r^2 to a plot using R? I have attached a script with sample data, which is in a similar format to my real dataset; unfortunately, I am at a standstill. Is there an easier way to add these statistics to the graph than to create an object from an equation and insert it into text()? I would ideally like the statistics displayed stacked on the graph. How can I accomplish this?
## Generate Sample Data
x = c(2,4,6,8,9,4,5,7,8,9,10)
y = c(4,7,6,5,8,9,5,6,7,9,10)
# Create a dataframe to resemble existing data
mydata = data.frame(x,y)
#Plot the data
plot(mydata$x,mydata$y)
abline(fit <- lm(y~x))
# Calculate RMSE
model = sqrt(deviance(fit)/df.residual(fit))
# Add RMSE value to plot
text(3,9,model)
Here is a version using base graphics and ?plotmath to draw the plot and annotate it.
## Generate Sample Data
x = c(2,4,6,8,9,4,5,7,8,9,10)
y = c(4,7,6,5,8,9,5,6,7,9,10)
## Create a dataframe to resemble existing data
mydata = data.frame(x,y)
## fit model
fit <- lm(y~x, data = mydata)
Next, calculate the values you want to appear in the annotation. I prefer bquote() for this, where anything marked up as .(foo) will be replaced by the value of the object foo. The answer @mnel points you to in the comments uses substitute() to achieve the same thing, but via different means. So I create objects in the workspace for each value you might wish to display in the annotation:
## Calculate RMSE and other values
rmse <- round(sqrt(mean(resid(fit)^2)), 2)
coefs <- coef(fit)
b0 <- round(coefs[1], 2)
b1 <- round(coefs[2],2)
r2 <- round(summary(fit)$r.squared, 2)
Now build up the equation using constructs described in ?plotmath:
eqn <- bquote(italic(y) == .(b0) + .(b1)*italic(x) * "," ~~
r^2 == .(r2) * "," ~~ RMSE == .(rmse))
Once that is done you can draw the plot and annotate it with your expression
## Plot the data
plot(y ~ x, data = mydata)
abline(fit)
text(2, 10, eqn, pos = 4)
Which gives a plot of the data and fitted line, annotated in the top-left with the equation, r^2 and RMSE.
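Since you mentioned wanting the statistics stacked, a hedged variation (untested sketch) is to draw one expression per line via legend(), which accepts a vector of expressions:
# stacked annotation in the top-left corner, one expression per line, no box
legend("topleft", bty = "n",
       legend = c(as.expression(bquote(italic(y) == .(b0) + .(b1)*italic(x))),
                  as.expression(bquote(r^2 == .(r2))),
                  as.expression(bquote(RMSE == .(rmse)))))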
