Forecasting Time Series

library("forecast")

Loading required package: zoo


Attaching package: 'zoo'

The following objects are masked from 'package:base':

    as.Date, as.Date.numeric

Loading required package: timeDate

Loading required package: methods

This is forecast 7.3

Google Series (GOOG)

Daily closing price: May 2, 2005 to December 29, 2006.

# Download the Google daily closing prices and plot the raw series.
data <- read.csv("http://ptrckprry.com/course/forecasting/data/google.csv")
GOOG <- data$google
n <- length(GOOG)
# seq_len() gives an empty index when n == 0, whereas 1:n would give c(1, 0).
time <- seq_len(n)
plot(time, GOOG, type="l", col=2)

Log GOOG

# Natural log of the closing-price series, plotted against the time index.
log.GOOG <- log(GOOG)
plot(time, log.GOOG, type="l", col=2)

# Sample autocorrelation function of the log series.
Acf(log.GOOG)

# Sample partial autocorrelation function of the log series.
Pacf(log.GOOG)

Diff Log GOOG

# First difference of the log series. diff() returns one fewer value than its
# input, so a leading NA pads the result back to length n for plotting against
# `time`.
diff.log.GOOG <- c(NA, diff(log.GOOG))

plot(time, diff.log.GOOG, type="l", col=2)

# Sample ACF and PACF of the differenced log series.
Acf(diff.log.GOOG)

Pacf(diff.log.GOOG)

2nd Diff Log GOOG

# Second difference of the log series, again padded with a leading NA to keep
# length n. Because diff.log.GOOG already starts with NA, the first two entries
# here are NA.
diff2.log.GOOG <- c(NA, diff(diff.log.GOOG))

plot(time, diff2.log.GOOG, type="l", col=2)

# Sample ACF and PACF of the twice-differenced log series.
Acf(diff2.log.GOOG)

Pacf(diff2.log.GOOG)

ARIMA(0, 1, 0)

# Fit ARIMA(0,1,0) to the log series with no constant term and print the fit.
fit.00 <- Arima(log.GOOG, c(0, 1, 0), include.constant=FALSE)
print(fit.00)

Series: log.GOOG 
ARIMA(0,1,0)                    

sigma^2 estimated as 0.0004203:  log likelihood=1036.72
AIC=-2071.45   AICc=-2071.44   BIC=-2067.41

# AICc value stored on the fitted model object.
fit.00$aicc

[1] -2071.437

AICC for Candidate Models (Buggy Version)

# Candidate grid fitted directly to the log series: ARIMA(p, 1, q) for
# p, q in {0, 1, 2}. Objects named fit.pq omit the constant; objects named
# fit.pqc include it.

# No constant term
fit.00 <- Arima(log.GOOG, order = c(0, 1, 0), include.constant = FALSE)
fit.01 <- Arima(log.GOOG, order = c(0, 1, 1), include.constant = FALSE)
fit.02 <- Arima(log.GOOG, order = c(0, 1, 2), include.constant = FALSE)
fit.10 <- Arima(log.GOOG, order = c(1, 1, 0), include.constant = FALSE)
fit.11 <- Arima(log.GOOG, order = c(1, 1, 1), include.constant = FALSE)
fit.12 <- Arima(log.GOOG, order = c(1, 1, 2), include.constant = FALSE)
fit.20 <- Arima(log.GOOG, order = c(2, 1, 0), include.constant = FALSE)
fit.21 <- Arima(log.GOOG, order = c(2, 1, 1), include.constant = FALSE)
fit.22 <- Arima(log.GOOG, order = c(2, 1, 2), include.constant = FALSE)

# Constant term included
fit.00c <- Arima(log.GOOG, order = c(0, 1, 0), include.constant = TRUE)
fit.01c <- Arima(log.GOOG, order = c(0, 1, 1), include.constant = TRUE)
fit.02c <- Arima(log.GOOG, order = c(0, 1, 2), include.constant = TRUE)
fit.10c <- Arima(log.GOOG, order = c(1, 1, 0), include.constant = TRUE)
fit.11c <- Arima(log.GOOG, order = c(1, 1, 1), include.constant = TRUE)
fit.12c <- Arima(log.GOOG, order = c(1, 1, 2), include.constant = TRUE)
fit.20c <- Arima(log.GOOG, order = c(2, 1, 0), include.constant = TRUE)
fit.21c <- Arima(log.GOOG, order = c(2, 1, 1), include.constant = TRUE)
fit.22c <- Arima(log.GOOG, order = c(2, 1, 2), include.constant = TRUE)

# Gather the log-likelihood and AICc of all 18 fits into one table. Rows follow
# the fitting order above: q varies fastest, then p, then the constant flag.
models <- local({
  fits <- list(
    fit.00, fit.01, fit.02, fit.10, fit.11, fit.12, fit.20, fit.21, fit.22,
    fit.00c, fit.01c, fit.02c, fit.10c, fit.11c, fit.12c,
    fit.20c, fit.21c, fit.22c
  )
  # Extract one scalar field from every fit, in order.
  pull <- function(field) {
    vapply(fits, function(fit) fit[[field]], numeric(1))
  }
  data.frame(
    p = rep(rep(c(0, 1, 2), each = 3), times = 2),
    d = rep(1, length(fits)),
    q = rep(c(0, 1, 2), times = 6),
    include.constant = rep(c(FALSE, TRUE), each = 9),
    loglik = pull("loglik"),
    aicc = pull("aicc")
  )
})
print(models, digits = 6)

   p d q include.constant  loglik     aicc
1  0 1 0            FALSE 1036.72 -2071.44
2  0 1 1            FALSE 1036.94 -2069.84
3  0 1 2            FALSE 1037.05 -2068.05
4  1 1 0            FALSE 1036.95 -2069.86
5  1 1 1            FALSE 1037.63 -2069.19
6  1 1 2            FALSE 1037.63 -2067.16
7  2 1 0            FALSE 1037.06 -2068.07
8  2 1 1            FALSE 1039.76 -2071.42
9  2 1 2            FALSE 1039.64 -2069.14
10 0 1 0             TRUE 1038.23 -2072.43
11 0 1 1             TRUE 1038.37 -2070.67
12 0 1 2             TRUE 1038.43 -2068.76
13 1 1 0             TRUE 1038.37 -2070.68
14 1 1 1             TRUE 1038.67 -2069.25
15 1 1 2             TRUE 1041.25 -2072.35
16 2 1 0             TRUE 1038.43 -2068.77
17 2 1 1             TRUE 1041.10 -2072.06
18 2 1 2             TRUE 1043.53 -2074.87

These results are unreliable. Use the method described in the next section instead.

AICC for Candidate Models (Correct Version)

# Candidate grid fitted to the already-differenced log series: ARMA(p, q) for
# p, q in {0, 1, 2}, i.e. order c(p, 0, q) on diff.log.GOOG. Objects named
# fit.pq omit the constant; objects named fit.pqc include it.

# No constant term
fit.00 <- Arima(diff.log.GOOG, order = c(0, 0, 0), include.constant = FALSE)
fit.01 <- Arima(diff.log.GOOG, order = c(0, 0, 1), include.constant = FALSE)
fit.02 <- Arima(diff.log.GOOG, order = c(0, 0, 2), include.constant = FALSE)
fit.10 <- Arima(diff.log.GOOG, order = c(1, 0, 0), include.constant = FALSE)
fit.11 <- Arima(diff.log.GOOG, order = c(1, 0, 1), include.constant = FALSE)
fit.12 <- Arima(diff.log.GOOG, order = c(1, 0, 2), include.constant = FALSE)
fit.20 <- Arima(diff.log.GOOG, order = c(2, 0, 0), include.constant = FALSE)
fit.21 <- Arima(diff.log.GOOG, order = c(2, 0, 1), include.constant = FALSE)
fit.22 <- Arima(diff.log.GOOG, order = c(2, 0, 2), include.constant = FALSE)

# Constant term included
fit.00c <- Arima(diff.log.GOOG, order = c(0, 0, 0), include.constant = TRUE)
fit.01c <- Arima(diff.log.GOOG, order = c(0, 0, 1), include.constant = TRUE)
fit.02c <- Arima(diff.log.GOOG, order = c(0, 0, 2), include.constant = TRUE)
fit.10c <- Arima(diff.log.GOOG, order = c(1, 0, 0), include.constant = TRUE)
fit.11c <- Arima(diff.log.GOOG, order = c(1, 0, 1), include.constant = TRUE)
fit.12c <- Arima(diff.log.GOOG, order = c(1, 0, 2), include.constant = TRUE)
fit.20c <- Arima(diff.log.GOOG, order = c(2, 0, 0), include.constant = TRUE)
fit.21c <- Arima(diff.log.GOOG, order = c(2, 0, 1), include.constant = TRUE)
fit.22c <- Arima(diff.log.GOOG, order = c(2, 0, 2), include.constant = TRUE)

# Gather the log-likelihood and AICc of all 18 fits into one table. Rows follow
# the fitting order above: q varies fastest, then p, then the constant flag.
# The d column is reported as 1 because the input series is the first
# difference of log.GOOG, even though each fit uses d = 0.
models <- local({
  fits <- list(
    fit.00, fit.01, fit.02, fit.10, fit.11, fit.12, fit.20, fit.21, fit.22,
    fit.00c, fit.01c, fit.02c, fit.10c, fit.11c, fit.12c,
    fit.20c, fit.21c, fit.22c
  )
  # Extract one scalar field from every fit, in order.
  pull <- function(field) {
    vapply(fits, function(fit) fit[[field]], numeric(1))
  }
  data.frame(
    p = rep(rep(c(0, 1, 2), each = 3), times = 2),
    d = rep(1, length(fits)),
    q = rep(c(0, 1, 2), times = 6),
    include.constant = rep(c(FALSE, TRUE), each = 9),
    loglik = pull("loglik"),
    aicc = pull("aicc")
  )
})
print(models, digits = 6)

   p d q include.constant  loglik     aicc
1  0 1 0            FALSE 1036.72 -2071.44
2  0 1 1            FALSE 1036.94 -2069.84
3  0 1 2            FALSE 1037.05 -2068.05
4  1 1 0            FALSE 1036.95 -2069.86
5  1 1 1            FALSE 1037.63 -2069.19
6  1 1 2            FALSE 1039.91 -2071.72
7  2 1 0            FALSE 1037.06 -2068.07
8  2 1 1            FALSE 1039.76 -2071.42
9  2 1 2            FALSE 1039.64 -2069.14
10 0 1 0             TRUE 1038.23 -2072.43
11 0 1 1             TRUE 1038.37 -2070.67
12 0 1 2             TRUE 1038.43 -2068.76
13 1 1 0             TRUE 1038.37 -2070.68
14 1 1 1             TRUE 1038.67 -2069.24
15 1 1 2             TRUE 1041.25 -2072.35
16 2 1 0             TRUE 1038.43 -2068.77
17 2 1 1             TRUE 1041.10 -2072.06
18 2 1 2             TRUE 1040.73 -2069.26

Selected Model

# Refit the selected specification, ARIMA(0,1,0) with a constant, on the
# undifferenced log series, then print the fit.
fit.best <- Arima(log.GOOG, c(0, 1, 0), include.constant=TRUE)
print(fit.best)

Series: log.GOOG 
ARIMA(0,1,0) with drift         

Coefficients:
       drift
      0.0017
s.e.  0.0010

sigma^2 estimated as 0.0004183:  log likelihood=1038.23
AIC=-2072.46   AICc=-2072.43   BIC=-2064.38

Note: the constant (drift) is not significant. However, the hypothesis test relies on the model being correct, so the z-statistic and corresponding p-value are probably not reliable. It’s better to use AICC to determine whether to include a constant.

Residuals

# Residuals of the selected model, plotted over time.
resid <- residuals(fit.best)
plot(time, resid, type="l", col=2)

# Sample ACF and PACF of the residuals.
Acf(resid)

Pacf(resid)

Ljung-Box Goodness-of-Fit Test

# Ljung-Box test on the residuals using the first 12 lags.
# fitdf = 1 (p = 0, q = 0, model includes constant)
Box.test(resid, lag=12, type = "Ljung-Box", fitdf=1)


    Box-Ljung test

data:  resid
X-squared = 21.904, df = 11, p-value = 0.02513

# Same Ljung-Box test with 24 lags.
Box.test(resid, lag=24, type = "Ljung-Box", fitdf=1)


    Box-Ljung test

data:  resid
X-squared = 27.885, df = 23, p-value = 0.2202

# Same Ljung-Box test with 36 lags.
Box.test(resid, lag=36, type = "Ljung-Box", fitdf=1)


    Box-Ljung test

data:  resid
X-squared = 40.881, df = 35, p-value = 0.2279

# Same Ljung-Box test with 48 lags.
Box.test(resid, lag=48, type = "Ljung-Box", fitdf=1)


    Box-Ljung test

data:  resid
X-squared = 54.659, df = 47, p-value = 0.2065

Forecasts From Selected Model

# Forecast 10 steps ahead from the selected model with 95% intervals.
# Values are on the log scale because the model was fit to log.GOOG.
forecast(fit.best, h=10, level=95)

    Point Forecast    Lo 95    Hi 95
422       6.134003 6.093917 6.174090
423       6.135737 6.079047 6.192428
424       6.137471 6.068040 6.206903
425       6.139205 6.059033 6.219378
426       6.140940 6.051303 6.230576
427       6.142674 6.044482 6.240865
428       6.144408 6.038349 6.250466
429       6.146142 6.032760 6.259523
430       6.147876 6.027616 6.268135
431       6.149610 6.022845 6.276374

# Plot 100-step-ahead forecasts from the selected model with 95% intervals.
plot(forecast(fit.best, h=100, level=95), col=2)

Forecasts From Model Without Constant

# Refit ARIMA(0,1,0) without a constant on the log series and plot its
# 100-step-ahead forecasts, for comparison with the model that has a constant.
fit.00 <- Arima(log.GOOG, c(0, 1, 0), include.constant=FALSE)
plot(forecast(fit.00, h=100, level=95), col=2)

Don’t Use p-values for Model Selection

# Fit ARIMA(1,1,1) without a constant on the log series and print the fit.
fit.11 <- Arima(log.GOOG, c(1, 1, 1), include.constant=FALSE)
print(fit.11)

Series: log.GOOG 
ARIMA(1,1,1)                    

Coefficients:
         ar1      ma1
      0.8953  -0.8659
s.e.  0.1402   0.1571

sigma^2 estimated as 0.0004205:  log likelihood=1037.63
AIC=-2069.25   AICc=-2069.19   BIC=-2057.13

# z-statistic for ar1: estimate divided by standard error, typed in as rounded
# values. The outer parentheses make the assignment print its value.
(zstat.ar1 <- 0.895 / 0.140)

[1] 6.392857

# Two-sided p-value for the ar1 z-statistic under a standard normal reference.
(pval.ar1 <- 2*pnorm(-abs(zstat.ar1)))

[1] 1.628144e-10

# z-statistic for ma1: estimate divided by standard error, typed in as rounded
# values. The outer parentheses make the assignment print its value.
(zstat.ma1 <- -0.866 / 0.157)

[1] -5.515924

# Two-sided p-value for the ma1 z-statistic under a standard normal reference.
(pval.ma1 <- 2*pnorm(-abs(zstat.ma1)))

[1] 3.46953e-08