9.2 Exercise 11

Choose one of the following seasonal time series: the Australian production (aus_production) of:

  • electricity
  • cement
  • gas
aus_production%>%
  select(Quarter,Electricity) %>%
  head()
## # A tsibble: 6 x 2 [1Q]
##   Quarter Electricity
##     <qtr>       <dbl>
## 1 1956 Q1        3923
## 2 1956 Q2        4436
## 3 1956 Q3        4806
## 4 1956 Q4        4418
## 5 1957 Q1        4339
## 6 1957 Q2        4811
aus_production%>%
  select(Quarter,Electricity) %>%
  pull(Quarter)%>%
  range()
## <yearquarter[2]>
## [1] "1956 Q1" "2010 Q2"
## # Year starts on: January

9.2.1 Do the data need transforming?

If so, find a suitable transformation.

aus_production%>%
  select(Quarter,Electricity) %>% 
     drop_na() %>% 
     autoplot() + 
     geom_smooth(method = 'loess', se = FALSE, color = 'steelblue') + 
     labs(x = "Quarter", 
          y = 'Electricity in kWh (billion)', 
          title = 'Australian quarterly electricity production (1956 Q1 - 2010 Q2)')

 aus_production%>%
  select(Quarter,Electricity) %>%
  gg_season(Electricity, period = "year")+
  theme(legend.position = "none") +
  labs(y="kWh (billion)", title="Australian Electricity production by year (1956-2010)")

aus_production%>%
  select(Quarter,Electricity) %>%
  filter(year(Quarter) %in%c(1960,2000)) %>%
  mutate(year=year(Quarter))%>%
  ggplot(aes(Quarter,Electricity,group=year,color=factor(year)))+
  geom_line() +
  facet_wrap(vars(year),scales = "free_x") +
  labs(y="kWh (billion)", title="Australian Electricity production",color="Year")

Classical decomposition methods assume that the seasonal component repeats from year to year.

For many series, this is a reasonable assumption, but for some longer series it is not.

For example, electricity demand patterns have changed over time as air conditioning has become more widespread. In many locations, the seasonal usage pattern from several decades ago had its maximum demand in winter (due to heating), while the current seasonal pattern has its maximum demand in summer (due to air conditioning).

Classical decomposition methods are unable to capture these seasonal changes over time.

9.2.2 Are the data stationary?

If not, find an appropriate differencing which yields stationary data.

First step is to apply the difference() function. We can see that there is some stationarity in the data, but we need to apply further modifications in order to conform it.

 aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity)) %>% 
     drop_na() %>% 
     autoplot() + 
     geom_smooth(method = 'loess', se = FALSE, color = 'steelblue') + 
     labs(x = "Quarter", 
          y = 'Electricity in kWh (billion)', 
          title = 'Australian quarterly electricity production (1956 Q1 - 2010 Q2)')

Let’s check the Autocorrelation with the ACF() function, to see whether some lags appear in the data. We do this in the orginal/observed data and in the differencing data.

p1 <- aus_production%>%
  select(Quarter,Electricity) %>% 
     drop_na() %>%
  ACF(Electricity) %>%
  autoplot()
 p2 <- aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity)) %>% 
     drop_na() %>%
  ACF(Electricity) %>%
  autoplot()
library(patchwork)
p1|p2

Lags appear to be very evident. Also, applying ljung_box option in the features() function, with a lag of 4.

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity)) %>% 
  drop_na() %>%
  features(Electricity, ljung_box, lag = 4)
## # A tibble: 1 × 2
##   lb_stat lb_pvalue
##     <dbl>     <dbl>
## 1    227.         0

So, we apply a differencing of 4:

`Electricity = difference(Electricity, differences = 4)`
aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity,differences = 4)) %>% 
     drop_na() %>% 
     autoplot() + 
     geom_smooth(method = 'loess', se = FALSE, color = 'steelblue') + 
     labs(x = "Quarter", 
          y = 'Electricity in kWh (billion)', 
          title = 'Australian quarterly electricity production (1956 Q1 - 2010 Q2)')

And then, again a difeerence of the differencing of 4:

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity,differences = 4),
         Electricity = difference(Electricity)) %>% 
     drop_na() %>% 
     autoplot() + 
     geom_smooth(method = 'loess', se = FALSE, color = 'steelblue') + 
     labs(x = "Quarter", 
          y = 'Electricity in kWh (billion)', 
          title = 'Australian quarterly electricity production (1956 Q1 - 2010 Q2)')

Essentially it is not random, which means that it is correlated with that of previous days.

What happens if we apply the log tranformation as well as previous differencing? We can clearly see how the variance in the data changes:

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = log(Electricity),
         Electricity = difference(Electricity,differences = 4),
         Electricity = difference(Electricity)) %>% 
     drop_na() %>% 
     autoplot() + 
     geom_smooth(method = 'loess', se = FALSE, color = 'steelblue') + 
     labs(x = "Quarter", 
          y = 'Electricity in kWh (billion)', 
          title = 'Australian quarterly electricity production (1956 Q1 - 2010 Q2)')

And, perform the autocorrelation checks:

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = log(Electricity),
         Electricity = difference(Electricity,differences = 4),
         Electricity = difference(Electricity)) %>% 
  drop_na() %>%
  ACF(Electricity) %>%
  autoplot()

We still see many lags, to have a clearer vision of the applied transformation. In this visualization the last plot shows the best transformation to data:

 `Doubly differenced log Electricity` = difference(difference(log(Electricity), 4), 1)
aus_production%>%
  select(Quarter,Electricity) %>% 
 transmute(
    `kWt (biillion)` = Electricity,
    `Log Electricity` = log(Electricity),
    `Annual change in log Electricity` = difference(log(Electricity), 4),
    `Doubly differenced log Electricity` = difference(difference(log(Electricity), 4), 1)
  ) %>%
  pivot_longer(-Quarter, names_to="Type", values_to="Electricity") |>
  mutate(
    Type = factor(Type, levels = c(
      "kWt (biillion)",
      "Log Electricity",
      "Annual change in log Electricity",
      "Doubly differenced log Electricity"))
  ) |>
  ggplot(aes(x = Quarter, y = Electricity)) +
  geom_line() +
  geom_smooth(linewidth=0.3,se=F,color="steelblue")+
  facet_grid(vars(Type), scales = "free_y") +
  labs(title = "Electricity", y = NULL)

Finally, the application of the ACF() to this last transformation let’s us confirming the 4 lags option. To determine more objectively whether differencing is required is to use a unit root test.

In this test, the null hypothesis is that the data are stationary, and we look for evidence that the null hypothesis is false.

So, we apply the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test, or KPSS test p-value.

aus_production%>%
  select(Quarter,Electricity) %>%
  mutate(Electricity= difference(difference(log(Electricity), 4), 1)) %>%
  drop_na() %>%
  ACF() %>%
  autoplot()

The KPSS test p-value is reported as a number between 0.01 and 0.1.

If we use it on observed data the result shows that the null hypothesis would need to be rejected, and the data would be not stationary.

What’s happen if we apply our previous transformations?

aus_production%>%
  select(Quarter,Electricity) %>% 
features(Electricity, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      4.46        0.01

Differencing data appear stationary

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity)) %>% 
features(Electricity, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0591         0.1
aus_production%>%
  select(Quarter,Electricity) %>%
  mutate(Electricity= difference(difference(log(Electricity), 4), 1)) %>%
  drop_na() %>%
  features(Electricity, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0111         0.1

Further checks can be done with unitroot_ndiffs which in this case is 1, indicating that one seasonal difference is required.

aus_production%>%
  select(Quarter,Electricity) %>% 
  features(Electricity, unitroot_ndiffs)
## # A tibble: 1 × 1
##   ndiffs
##    <int>
## 1      1

A further look at how the differencing changes, if we consider the log() and the period differencing. Four Quarters, or 4 periods are 1 year.

aus_production %>%
  select(Quarter, Electricity)  |>
  mutate(Electricity=log(Electricity) %>% difference(4)) |>
  autoplot()

What if we use 8 periods?

aus_production %>%
  select(Quarter, Electricity)  |> 
  mutate(Electricity=log(Electricity) %>% difference(8)) |>
  autoplot()

Just a remainder of what happen in 1 period.

aus_production %>%
  select(Quarter, Electricity)  |>
  mutate(Electricity=log(Electricity) %>% difference(1)) |>
  autoplot()

9.2.3 Which of the models is the best according to their AIC values?

Identify a couple of ARIMA models that might be useful in describing the time series.

ARIMA(p,d,q) = autoregressive AR(p) + differencing d + moving average MA(q)

ARIMA(1,0,1) model - Auto Arima

fit <- aus_production%>%
  select(Quarter,Electricity) %>%  
  model(ARIMA(Electricity))
report(fit)
## Series: Electricity 
## Model: ARIMA(1,0,1)(1,1,2)[4] w/ drift 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1    sma2  constant
##       0.9163  -0.4416  0.8623  -1.6919  0.7736   10.5956
## s.e.  0.0467   0.0942  0.1434   0.1436  0.0960    2.1097
## 
## sigma^2 estimated as 508179:  log likelihood=-1708.84
## AIC=3431.68   AICc=3432.22   BIC=3455.24

Let’s apply the partial autocorrealtion with the PACF() function:

aus_production%>%
  select(Quarter,Electricity) %>% 
  mutate(Electricity = difference(Electricity)) %>% 
     drop_na() %>%
  PACF(Electricity) |>
  autoplot()

To see ACF() and PACF() together we can use gg_tsdisplay()

aus_production %>%
  select(Quarter, Electricity)  |>
  gg_tsdisplay(difference(Electricity), plot_type='partial')

ARIMA(4,0,0) model

And then try with arima with 4 differencing.

model(ARIMA(Electricity ~ pdq(4,0,0)))
fit2 <- aus_production %>%
  select(Quarter, Electricity) %>%
  model(ARIMA(Electricity ~ pdq(4, 0, 0)))

report(fit2)
## Series: Electricity 
## Model: ARIMA(4,0,0)(0,1,2)[4] w/ drift 
## 
## Coefficients:
##          ar1     ar2     ar3     ar4     sma1    sma2  constant
##       0.4589  0.1815  0.0539  0.2190  -0.9733  0.2480   82.5476
## s.e.  0.0690  0.0780  0.0743  0.0951   0.1236  0.1077   13.7643
## 
## sigma^2 estimated as 533342:  log likelihood=-1712.56
## AIC=3441.13   AICc=3441.83   BIC=3468.06

Results of the two models:

Model: ARIMA(1,0,1)(1,1,2)[4] w/ drift - AICc = 3432.22 the best Model: ARIMA(4,0,0)(0,1,2)[4] w/ drift - AICc = 3441.83

9.2.4 Do the residuals resemble white noise?

Estimate the parameters of your best model and do diagnostic testing on the residuals. If not, try to find another ARIMA model which fits better.

ARIMA(1,1,1) model

Let’s check another arima model with one:

 model(ARIMA(Electricity ~ pdq(1,1,1)))
 
 
fit3 <- aus_production%>%
  select(Quarter,Electricity) %>%  
 model(ARIMA(Electricity ~ pdq(1,1,1)))

report(fit3)
## Series: Electricity 
## Model: ARIMA(1,1,1)(1,1,2)[4] 
## 
## Coefficients:
##          ar1      ma1    sar1     sma1    sma2
##       0.2454  -0.7394  0.8927  -1.7320  0.7908
## s.e.  0.1366   0.1071  0.0851   0.0941  0.0752
## 
## sigma^2 estimated as 512273:  log likelihood=-1703.01
## AIC=3418.03   AICc=3418.43   BIC=3438.19

Here we compare all models together:

compare_fit <- aus_production%>% 
  select(Quarter,Electricity) |>
  model(arima400 = ARIMA(Electricity ~ pdq(4,0,0)),
        arima210 = ARIMA(Electricity ~ pdq(2,1,0)),
        arima111 = ARIMA(Electricity ~ pdq(1,1,1)),
        search = ARIMA(Electricity, stepwise=FALSE))

compare_fit |> pivot_longer(cols=everything(),
                            names_to = "Model name",
                            values_to = "Orders")
## # A mable: 4 x 2
## # Key:     Model name [4]
##   `Model name`                            Orders
##   <chr>                                  <model>
## 1 arima400     <ARIMA(4,0,0)(0,1,2)[4] w/ drift>
## 2 arima210              <ARIMA(2,1,0)(1,1,2)[4]>
## 3 arima111              <ARIMA(1,1,1)(1,1,2)[4]>
## 4 search       <ARIMA(1,0,1)(1,1,2)[4] w/ drift>
glance(compare_fit) |> arrange(AICc) |> select(.model:BIC)
## # A tibble: 4 × 6
##   .model    sigma2 log_lik   AIC  AICc   BIC
##   <chr>      <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 arima111 512273.  -1703. 3418. 3418. 3438.
## 2 arima210 538128.  -1708. 3427. 3428. 3447.
## 3 search   508179.  -1709. 3432. 3432. 3455.
## 4 arima400 533342.  -1713. 3441. 3442. 3468.
compare_fit |>
  select(arima111) |>
  gg_tsresiduals()

augment(compare_fit) |>
  filter(.model=='arima111') |>
  features(.innov, ljung_box, lag = 4, dof = 1)
## # A tibble: 1 × 3
##   .model   lb_stat lb_pvalue
##   <chr>      <dbl>     <dbl>
## 1 arima111    1.68     0.641
compare_fit |>
  forecast(h=6) |>
  filter(.model=='arima111') |>
  autoplot(aus_production)

gg_arma(compare_fit |> select(arima111))

This last gg_tsdisplay() testes some other parameter values.

aus_production%>% 
  select(Quarter,Electricity) |>
  gg_tsdisplay(difference(log(Electricity), 12),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")

aus_production%>% 
  select(Quarter,Electricity) |>
  gg_tsdisplay(difference(log(Electricity), 12)|> difference(),
               plot_type='partial', lag=36) +
  labs(title="Double differenced", y="")

fit5 <- aus_production%>% 
  select(Quarter,Electricity) |>
  model(
    arima012011 = ARIMA(Electricity ~ pdq(0,1,2) + PDQ(0,1,1)),
    arima210011 = ARIMA(Electricity ~ pdq(2,1,0) + PDQ(0,1,1)),
    auto = ARIMA(Electricity, stepwise = FALSE, approx = FALSE)
  )
fit5 |> pivot_longer(everything(), 
                     names_to = "Model name",
                     values_to = "Orders")
## # A mable: 3 x 2
## # Key:     Model name [3]
##   `Model name`                            Orders
##   <chr>                                  <model>
## 1 arima012011           <ARIMA(0,1,2)(0,1,1)[4]>
## 2 arima210011           <ARIMA(2,1,0)(0,1,1)[4]>
## 3 auto         <ARIMA(1,0,1)(2,1,2)[4] w/ drift>

9.2.5 Compare the forecasts obtained using ETS()

Forecast the next 24 months of data using your preferred model.

observed_data <- aus_production%>% select(Quarter,Electricity)

forecast_data <- fit %>% 
  forecast(h=8) %>%
  as.data.frame()%>%
  select(Quarter,Electricity='.mean')%>%
  mutate(model="Arima101") %>%
  rbind(fit3 %>% 
  forecast(h=8) %>%
  as.data.frame()%>%
  select(Quarter,Electricity='.mean')%>%
   mutate(model="Arima111"))
ggplot(mapping=aes(Quarter,Electricity))+
  geom_line(data=observed_data%>%filter(year(Quarter) > 2000)) +
  geom_line(data=forecast_data,
            aes(group=model,color=factor(model)),
            inherit.aes = TRUE,
            linewidth=0.3)+
  scale_color_viridis_d(begin = 0,end = 0.5,direction = -1)+
  labs(y = "kWt (billion)", 
       color="Model",
       title = "Australian Electricity 2-model forecast (24 months)")

ETS (ExponenTial Smoothing)

aus_production |>
  select(Quarter,Electricity) |>
  stretch_tsibble(.init = 10) |>
  model(
    ETS(Electricity),
    ARIMA(Electricity)
  ) |>
  forecast(h = 8) |>
  fabletools::accuracy(aus_production) |>
  select(.model, RMSE:MAPE)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 8 observations are missing between 2010 Q3 and 2012 Q2
## # A tibble: 2 × 5
##   .model              RMSE   MAE   MPE  MAPE
##   <chr>              <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Electricity) 1033.  715. 0.310  2.50
## 2 ETS(Electricity)   1044.  719. 0.587  2.59
# Estimate parameters
fit4 <- aus_production |>
  model(ETS(Electricity ~ error("A") + trend("Ad") + season("A")))
fc <- fit4 |>
  forecast(h = 8)

fc |>
  autoplot(aus_production) +
  geom_line(aes(y = .fitted), col="#D55E00",
            data = augment(fit)) +
  labs(y="kWt (billion)", title="Electricity: Australia") +
  guides(colour = "none")