统一设置ggplot2的绘图风格

library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))

7.1:时间序列的相关检验

白噪声检验

如果时间序列数据没有通过白噪声检验,则说明该序列为随机数序列,则没有建立时间序列模型进行分析的必要。

单位根检验

用来判断时间序列是否为平稳序列

协整检验和Granger因果检验

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(tseries)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 模拟一组随机数据和非随机数据进行白噪声检验
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
AirPas$Month <- as.yearmon(AirPas$Month)
set.seed(123) ## 生成一组随机数据
AirPas$randdata <- round(rnorm(nrow(AirPas),mean = mean(AirPas$Passengers),
                               sd=50))
head(AirPas)
##     Month Passengers randdata
## 1  1 1949        112      252
## 2  2 1949        118      269
## 3  3 1949        132      358
## 4  4 1949        129      284
## 5  5 1949        121      287
## 6  6 1949        135      366
# ## 转化为时间序列数据
# Air_ts <- ts(AirPas$Passengers,start = c(1949,1),deltat = 1/12)
# plot.ts(Air_ts)
# ## 可视化两组时间序列数据
# Air_tsdf <- tsdf(Air_ts,colname = "month")
# head(Air_tsdf)
# set.seed(123)
# Air_tsdf$randdata <- round(rnorm(nrow(AirPas),mean = mean(AirPas$Passengers), 
#                                  sd=20))
# head(Air_tsdf)


## 可视化两组数据集
AirPas%>%gather(key = "dataclass",value = "Number",-Month)%>%
  ggplot(aes(x=Month,y=Number))+
  geom_line(aes(colour=dataclass,linetype = dataclass))+
  theme(legend.position = c(0.15,0.8))
## Don't know how to automatically pick scale for object of type yearmon. Defaulting to continuous.

## Ljung-Box 检验
Box.test(AirPas$Passengers,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  AirPas$Passengers
## X-squared = 132.14, df = 1, p-value < 2.2e-16
## p-value < 2.2e-16 说明该序列为非随机数据

Box.test(AirPas$randdata,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  AirPas$randdata
## X-squared = 0.13193, df = 1, p-value = 0.7164
## p-value = 0.6978 说明该序列为随机数据,即为白噪声


## 单位根检验,检验时间序列的平稳性
## ADF检验的零假设为有单位根

## 如果一个时间序列是平稳的,则通常只有随机成分,常用ARMA模型来预测未来的取值,
## 如果将一组数据经过d次差分后可以将不平稳的序列准化为平稳的序列,则称序列为d阶单整。

## 生成ARIMA(2,2,2)的时间序列数据,进行单位根检验演示
adfdata <- arima.sim(list(order = c(2,2,2),ar = c(0.8897, -0.4858),
                          d=2,ma = c(-0.2279, 0.2488)),n = 200)
diff1 <- diff(adfdata)
diff2 <- diff(diff1)
diff3 <- diff(diff2)
## 可视化4种曲线
par(mfrow=c(2,2),family = "STKaiti")
plot(adfdata,main="ARIMA(2,2,2)")
plot(diff1,main="差分1次")
plot(diff2,main="差分2次")
plot(diff3,main="差分3次")

## 进行单位根检验
adf.test(adfdata)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  adfdata
## Dickey-Fuller = -0.38412, Lag order = 5, p-value = 0.9862
## alternative hypothesis: stationary
##  p-value = 0.9684 说明数据不是平稳的
adf.test(diff1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1
## Dickey-Fuller = -1.959, Lag order = 5, p-value = 0.5933
## alternative hypothesis: stationary
## p-value = 0.6465  说明一阶差分后数据不是平稳的
adf.test(diff2)
## Warning in adf.test(diff2): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2
## Dickey-Fuller = -6.8816, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
Box.test(diff2,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  diff2
## X-squared = 62.357, df = 1, p-value = 2.887e-15
## p-value = 0.01, 说明2阶差分后数据是平稳的,而且不是白噪声数据
## 可以对原始数据建立ARIMA(p,2,q)模型
Box.test(diff3,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  diff3
## X-squared = 0.36924, df = 1, p-value = 0.5434
## 3阶差分后的数据已经是白噪声数据,没有分析的价值

7.2:自回归移动平均模型

如果一个时间序列数据是平稳的,那么可以使用ARMA模型来预测未来的数据

library(ggfortify)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(forecast)
## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fitted.fracdiff        fracdiff 
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify
##   residuals.fracdiff     fracdiff
# library(TSA)
## 读取数据
ARMAdata <- read.csv("data/chap7/ARMAdata.csv")
ARMAdata <- ts(ARMAdata$x)
plot.ts(ARMAdata)

autoplot(ARMAdata)+ggtitle("序列变化趋势")

## 白噪声检验
Box.test(ARMAdata,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ARMAdata
## X-squared = 62.357, df = 1, p-value = 2.887e-15
## p-value = 4.552e-15 ,说明不是白噪声

## 平稳性检验,单位根检验
adf.test(ARMAdata)
## Warning in adf.test(ARMAdata): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ARMAdata
## Dickey-Fuller = -6.8816, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
## p-value = 0.01,说明数据是平稳的

## 分析序列的自相关系数和偏自相关系数确定参数p和q
p1 <- autoplot(acf(ARMAdata,lag.max = 30,plot = F))+
  ggtitle("序列自相关图")
p2 <- autoplot(pacf(ARMAdata,lag.max = 30,plot = F))+
  ggtitle("序列偏自相关图")
gridExtra::grid.arrange(p1,p2,nrow=2)

## 偏自相关图3阶后截尾,可以认为p的取值为3左右,
## 自相关图5阶后截尾,可以认为q的取值为5左右,

## 通过观察自相关系数和偏自相关系数虽然可以确定p和q,但是这不是最好的方法,
## R提供了自动寻找序列合适的参数的函数
auto.arima(ARMAdata)
## Series: ARMAdata 
## ARIMA(3,0,0) with zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3
##       0.6641  -0.1544  -0.2454
## s.e.  0.0684   0.0823   0.0685
## 
## sigma^2 estimated as 0.9333:  log likelihood=-275.78
## AIC=559.56   AICc=559.76   BIC=572.75
## 可以发现较好的ARMA模型为ARMA(2,1)

## 对数据建立ARMA(2,1)模型,并预测后面的数据
ARMAmod <- arima(ARMAdata,order = c(2,0,1))
summary(ARMAmod)
## 
## Call:
## arima(x = ARMAdata, order = c(2, 0, 1))
## 
## Coefficients:
##          ar1      ar2      ma1  intercept
##       1.2059  -0.6074  -0.5455     0.1003
## s.e.  0.1030   0.0637   0.1217     0.0772
## 
## sigma^2 estimated as 0.9168:  log likelihood = -275.5,  aic = 561.01
## 
## Training set error measures:
##                       ME      RMSE       MAE       MPE     MAPE      MASE
## Training set 0.001170727 0.9574957 0.7646117 -38.47298 349.0644 0.8157659
##                     ACF1
## Training set -0.01911556
## 对拟合残差进行白噪声检验
Box.test(ARMAmod$residuals,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ARMAmod$residuals
## X-squared = 0.074183, df = 1, p-value = 0.7853
## p-value = 0.7853 ,说明是白噪声

## 可视化模型未来的预测值
par(family = "STKaiti")
plot(forecast(ARMAmod,h=20))

7.3:季节ARIMA模型

## 对飞机乘客数据建立季节趋势的ARIMA模型进行预测未来的数据
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
## 处理为时间序列数据
AirPas$Month <- as.yearmon(AirPas$Month)
AirPas <- ts(AirPas$Passengers,start = AirPas$Month[1],frequency = 12)
head(AirPas)
##      Jan Feb Mar Apr May Jun
## 1949 112 118 132 129 121 135
## 可视化序列
autoplot(AirPas)+ggtitle("飞机乘客数量变化趋势")

## 将数据即切分位两个部分,一部分用于训练模型,一部分用于查看预测效果
AirPas_train <- window(AirPas,end=c(1958,12))
AirPas_test <- window(AirPas,star=c(1959,1))
adf.test(AirPas_train,k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  AirPas_train
## Dickey-Fuller = -1.8449, Lag order = 12, p-value = 0.641
## alternative hypothesis: stationary
adf.test(diff(AirPas_train),k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(AirPas_train)
## Dickey-Fuller = -1.9293, Lag order = 12, p-value = 0.6059
## alternative hypothesis: stationary
adf.test(diff(diff(AirPas_train)),k=12)
## Warning in adf.test(diff(diff(AirPas_train)), k = 12): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(diff(AirPas_train))
## Dickey-Fuller = -7.6169, Lag order = 12, p-value = 0.01
## alternative hypothesis: stationary
## 说明数据延迟12阶,原始数据和差分一次数据都有单位根,而差分两次后数据是平稳的

AirPasdiff2 <- diff(diff(AirPas_train))
## 分析序列的自相关系数和偏自相关系数分析参数p和q
p1 <- autoplot(acf(AirPasdiff2,lag.max = 40,plot = F))+
  ggtitle("序列自相关图")
p2 <- autoplot(pacf(AirPasdiff2,lag.max = 40,plot = F))+
  ggtitle("序列偏自相关图")
gridExtra::grid.arrange(p1,p2,nrow=2)

## 从自相关图和偏自相关图可以很明显的发现数据可能具有周期性,
## 不能很好的确定参数p和q的取值,根据图可知,序列可能具有年周期性
## 使用auto.arima()函数确定模型的参数
auto.arima(AirPas_train)
## Series: AirPas_train 
## ARIMA(1,1,0)(0,1,0)[12] 
## 
## Coefficients:
##           ar1
##       -0.2397
## s.e.   0.0935
## 
## sigma^2 estimated as 103.6:  log likelihood=-399.64
## AIC=803.28   AICc=803.4   BIC=808.63
## 最好的模型为ARIMA(1,1,0)(0,1,0)[12] 

ARIMA <- arima(AirPas_train, c(1, 1, 0),
              seasonal = list(order = c(0, 1, 0),period = 12))
summary(ARIMA)
## 
## Call:
## arima(x = AirPas_train, order = c(1, 1, 0), seasonal = list(order = c(0, 1, 
##     0), period = 12))
## 
## Coefficients:
##           ar1
##       -0.2397
## s.e.   0.0935
## 
## sigma^2 estimated as 102.7:  log likelihood = -399.64,  aic = 803.28
## 
## Training set error measures:
##                       ME     RMSE      MAE         MPE    MAPE     MASE
## Training set -0.01614662 9.567988 7.120167 -0.03346415 2.90195 0.321312
##                    ACF1
## Training set 0.00821521
Box.test(ARIMA$residuals,type ="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ARIMA$residuals
## X-squared = 0.0083029, df = 1, p-value = 0.9274
## p-value = 0.9274,此时,模型的残差已经是白噪声数据,数据中的信息已经充分的提取出来了

## 可视化模型的预测值和这是值之间的差距
par(family = "STKaiti")
plot(forecast(ARIMA,h=24),shadecols="oldstyle")
points(AirPas_test,col = "red")
lines(AirPas_test,col = "red")

7.4:多元时间序列ARIMAX模型

library(readxl)
# library(TSA)
## 读取数据
gasco2 <- read_excel("data/chap7/gas furnace data.xlsx")
head(gasco2)
## # A tibble: 6 x 2
##   GasRate `C02%`
##     <dbl>  <dbl>
## 1  -0.109   53.8
## 2   0       53.6
## 3   0.178   53.5
## 4   0.339   53.5
## 5   0.373   53.4
## 6   0.441   53.1
GasRate <- ts(gasco2$GasRate)
CO2 <- ts(gasco2$`C02%`)

p1 <- autoplot(GasRate)
p2 <- autoplot(CO2)
gridExtra::grid.arrange(p1,p2,nrow=2)

##  切分位训练集和测试集
trainnum <- round(nrow(gasco2)*0.8)
GasRate_train <- window(GasRate,end = trainnum)
GasRate_test <- window(GasRate,start = trainnum+1)
CO2_train <- window(CO2,end = trainnum)
CO2_test <- window(CO2,start = trainnum+1)


## 自动寻找合适的p,q
auto.arima(y=CO2_train,xreg = GasRate_train)
## Series: CO2_train 
## Regression with ARIMA(2,1,2) errors 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    xreg
##       1.4773  -0.7232  -0.3818  0.2742  0.0520
## s.e.  0.0858   0.0710   0.1011  0.0881  0.1113
## 
## sigma^2 estimated as 0.1006:  log likelihood=-62.46
## AIC=136.92   AICc=137.29   BIC=157.7
# ARIMAXmod <- arimax(CO2_train,order = c(2,1,2),xreg = GasRate_train)
#ARIMAXmod <- arima(CO2_train,order = c(2,1,2),xreg = GasRate_train)
ARIMAXmod <- Arima(CO2_train,order = c(2,1,2),xreg = GasRate_train)
summary(ARIMAXmod)
## Series: CO2_train 
## Regression with ARIMA(2,1,2) errors 
## 
## Coefficients:
##          ar1      ar2      ma1     ma2    xreg
##       1.4773  -0.7232  -0.3818  0.2742  0.0520
## s.e.  0.0858   0.0710   0.1011  0.0881  0.1113
## 
## sigma^2 estimated as 0.1006:  log likelihood=-62.46
## AIC=136.92   AICc=137.29   BIC=157.7
## 
## Training set error measures:
##                       ME      RMSE       MAE         MPE      MAPE      MASE
## Training set 0.003694667 0.3131359 0.2320132 0.009857144 0.4359391 0.3913876
##                     ACF1
## Training set -0.00032096
## 可视化模型的预测值和真实值之间的差距
par(family = "STKaiti")
plot(forecast(ARIMAXmod,h=length(GasRate_test),xreg = GasRate_test))
lines(CO2_test,col="black")

7.5:prophet预测时间序列

library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
library(zoo)
## 读取数据并对数据重新命名
AirPas <- read.csv("data/chap7/AirPassengers.csv",stringsAsFactors=FALSE)
colnames(AirPas) <- c("ds","y")
AirPas$ds <- as.yearmon(AirPas$ds)
head(AirPas)
##        ds   y
## 1  1 1949 112
## 2  2 1949 118
## 3  3 1949 132
## 4  4 1949 129
## 5  5 1949 121
## 6  6 1949 135
## 建立具有季节趋势的模型
model <- prophet(AirPas,growth = "linear",
                 yearly.seasonality = TRUE,weekly.seasonality = FALSE,
                 daily.seasonality = FALSE,seasonality.mode = "multiplicative")
## 预测后面两年的数据,并将预测结果可视化
future <- make_future_dataframe(model, periods = 24,freq = "month")
forecast <- predict(model, future)
plot(model, forecast)

## 可视化预测的组成部分,主要有线性趋势和季节趋势
prophet_plot_components(model, forecast)