%% PROJECT TSA 2019 with Joel Bluhme
%% PART A Conclusions:
% A good ARMA model is saved in M1; the steps for arriving at it and the
% predictions with it on the validation and test data are found below.
% M1:
% A(z) = 1 - 1.342  (+/- 0.03405) z^-1
%          + 0.4108 (+/- 0.03324) z^-2
%          - 0.167  (+/- 0.02166) z^-23
%          - 0.2308 (+/- 0.05447) z^-24
%          + 0.3484 (+/- 0.0399)  z^-25
%
% C(z) = 1 + 0.05144 (+/- 0.03654) z^-1
%          - 0.3417  (+/- 0.04016) z^-24
%
% As seen, all parameters are highly significant except c1, whose confidence
% interval (estimate +/- 2 std) covers zero. It does however improve the
% validation-data predictions and is therefore kept.
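% In operator form M1 reads A(z)(y_t - m) = C(z)e_t, where m is the mean of
% the modelling set (subtracted below) and e_t is white noise.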
clear
clc
%%
load('climate66.dat')
load('climate67.dat')
load('climate68.dat')
load('climate69.dat')
%% The following stretch (approx. 16 weeks) of temperature data from 1967 seems relatively stationary; we choose it to make up our
% modelling (11 weeks), validation (3 weeks) and test data sets 1 and 2 (1 week each).
Mdldata=climate67(3400:5000,:); % Defines the data sets
Valdata=climate67(5001:5600,:);
Test1data=climate67(5601:5768,:);
Test2data=climate67(7501:7668,:);
Totdata=climate67(1:end,8);
T = linspace(datenum(1967 ,1 ,1) ,datenum(1967 ,12 ,31), length(Totdata)); % Create a date grid spanning 1967
plot(T, Totdata);
datetick('x');
%% ARMA modelling
%% Step 1a Check whether a transformation of the data is reasonable
OptLambda=bcNormPlot(climate67(1:8500,8)) % =1.07, so no transformation seems necessary
% (looks at the entire chosen data set)
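% For reference, the Box-Cox family searched by bcNormPlot is (up to the
% toolbox's exact convention) y^(lambda) = (y.^lambda - 1)/lambda for
% lambda ~= 0 and log(y) for lambda == 0; an optimum near lambda = 1 thus
% means the identity transform, ie no transformation, is adequate.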
%% Step 1b make the process y zero mean
y=Mdldata(:,8);
y=y-mean(y);
%% Step 2 Examine the pacf and acf of y
figure(1)
phi = pacf( y, 100,0.05, 1, 0 );
title("PACF for y");
figure(2)
rho = acf( y, 100,0.05, 1, 0 );
title("ACF for y");
figure(3)
normplot(phi)
title("Normplot of pacf"); % estimated pacf seems gaussian -> confidence intervals are reliable
%% Step 3 From the ACF we see a clear 24-lag periodicity, and the PACF shows two prominent peaks at lags 1 and 2
%%% Let us therefore try model 1, M1: an AR(2) combined with a 24-lag differentiator
% After removing the insignificant lag 26, adding lag 23, and adding MA
% lags 1 and 24 (MA lag 2 was insignificant), we conclude that
% M1 is a reasonably good model. The structure is sketched below.
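% Structure sketch: A(z) is initialized to (1 - z^-24), zero-padded to degree
% 26 (the conv with [1 0 0] reserves room for the AR(2) lags), and the Free
% masks let pem estimate only lags {1,2,23,24,25} of A and lags {1,24} of C;
% all other coefficients stay fixed at their initial values.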
data=iddata(y);
AS=[1 zeros(1,23) -1];   % seasonal factor 1 - z^-24
A=conv([1 0 0],AS);      % zero-pad to degree 26 to make room for the AR(2) lags
C=[1 1 0 zeros(1,21) 1]; % MA lags 1 and 24
ar=idpoly(A,[],C);       % initial model structure
ar.Structure.a.Free = [0 1 1 A(4:end-4) 1 1 1 0]; % free only A lags 1,2,23,24,25
ar.Structure.c.Free = C; % nonzero entries mark the free C lags (1 and 24)
M1 = pem(data,ar);       % estimate model M1
r1=resid(M1,data);
figure(1)
phi = pacf( r1.y, 100,0.05, 1, 0);
title("PACF for res");
figure(2)
rho = acf( r1.y, 100,0.05, 1, 0,0 );
title("ACF for res");
figure(3)
whitenessTest(r1.y,0.01)
present(M1)
% The test statistics in the whiteness test are relatively good for real
% data, and the ACF, PACF and cumulative periodogram of the residuals all look good.
% The validation-data predictions below also look good with this model.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Step 4 Prediction on validation set
%% 1-step prediction
k=1;
[F,G]=Diophantine(M1.c,M1.a,k)
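% Diophantine (course toolbox) splits C(z) = A(z)F(z) + z^(-k)G(z), which
% gives the k-step predictor yhat(t+k|t) = [G(z)/C(z)] y(t) implemented by
% the filter call below. A quick sanity check of the identity, a sketch
% assuming F and G are plain coefficient vectors:
AF = conv(M1.a(:)', F(:)');                          % A(z)*F(z)
chk = zeros(1, max(length(AF), k+length(G)));        % room for the shifted G
chk(1:length(AF)) = AF;
chk(k+(1:length(G))) = chk(k+(1:length(G))) + G(:)'; % add z^(-k)*G(z)
% chk should now equal M1.c up to trailing zeros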
SF=50; % Safety factor: start predicting SF time units before the target set so the filter's initial transient has died out
% Index bookkeeping (k=1):
% yhat_1(1)     = prediction of y(end-SF+1) = ynew(2), thus
% yhat_1(1+SF)  = prediction of ynew(SF+2)  = yval(1), ie the first "wanted" prediction
% yhat_1(end-1) = prediction of yval(end),  ie the last "wanted" prediction
y=Mdldata(:,8); % Our zero mean modelling data vector
y=y-mean(y);
yval=Valdata(:,8); % Our zero mean adj. validation data vector
yval=yval-mean(Mdldata(:,8)); % Subtract the modelling mean, not the validation mean, since the former is the mean we assume to be true,
% ie our model is that Temperature - mean(modelling set) is an ARMA process
ynew=[y(end-SF:end); yval]; % We filter this concatenated vector
yhat_1=filter(G,M1.c,ynew);
figure(1) % We plot the predicted vs true with the mean added
plot(yhat_1(1+SF:end-1)+mean(Mdldata(:,8))) % discard the first SF predictions
hold on
plot(yval(1:end)+mean(Mdldata(:,8)))
legend('1-step pred','True value')
pe1=yval(1:end)-yhat_1(1+SF:end-1); % 1-step pred error
hold off
figure(2)
rho = acf( pe1, 100,0.05, 1, 0,0);
title("ACF for pe1");
figure(3)
whitenessTest(pe1,0.01)
% The 1-step prediction residuals on the validation data set look very
% white, so our model is probably very good.
V_pe1=var(pe1) % =0.3621
mean(pe1) % =0.0455
% Passes almost all of the whiteness tests!
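% Note: for k=1 the Diophantine F(z) = 1, so V_pe1 also estimates the driving
% noise variance sigma^2. The theoretical k-step prediction error variance is
% sigma^2*sum(F.^2) = F'*F*V_pe1, which is what the TV_pe terms below compute.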
%% 7-step prediction
k=7;
[F,G]=Diophantine(M1.c,M1.a,k)
SF=50; % Safety factor: start predicting SF time units before the target set so the filter's initial transient has died out
y=Mdldata(:,8); % Our zero mean modelling data vector
y=y-mean(y);
yval=Valdata(:,8); % Our zero mean adj. validation data vector
yval=yval-mean(Mdldata(:,8));
% It is crucial that the predictions and true values are aligned; it can be seen that:
% yhat_7(1)= prediction of y(end-SF+k) = y(end-SF+(k-1)+1)=ynew(1+k)
% yhat_7(2)= prediction of y(end-SF+k+1) = ynew(2+k)
% ...
% yhat_7(2+SF-k)=prediction of ynew(2+SF)=yval(1) ie the first "wanted" prediction
% yhat_7(end-k)=prediction of yval(end) ie the last "wanted" pred.
ynew=[y(end-SF:end); yval]; % We concatenate the last SF+1 values of the modelling data vector and the validation vector
yhat_7=filter(G,M1.c,ynew); % We filter this concatenated vector
figure(1)
plot(yhat_7(2+SF-k:end-k)+mean(Mdldata(:,8)))
hold on
plot(yval(1:end)+mean(Mdldata(:,8)))
legend('7-step pred','True value')
pe7=yval(1:end)-yhat_7(2+SF-k:end-k); % 7-step pred error
hold off
figure(2)
rho = acf( pe7, 100,0.05, 1, 6 );
title("ACF for pe7");
V_pe7=var(pe7) % =3.7104
TV_pe7=F'*F*V_pe1 % =3.9, ie higher than the actual value
mean(pe7) % =0.4101, so our model underestimates the temperature,
% though not by much. This is logical, since we modelled during late spring /
% early summer while the validation set covers late July and August, which
% usually bring higher temperatures.
% This could be handled with a state space model that continuously
% re-estimates the model parameters (a minimal sketch follows below);
% we could also try a slightly smaller modelling set.
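% A minimal sketch of such an adaptive scheme, assuming a random-walk model
% for (only) the two AR parameters,
%   theta(t) = theta(t-1) + v(t),   y(t) = x(t)'*theta(t) + e(t),
% tracked with a Kalman filter; Re and Rw are assumed, not estimated:
na = 2;                                % number of tracked AR parameters
theta = zeros(na,1); P = eye(na);      % initial parameter estimate and covariance
Re = 1e-4*eye(na); Rw = 1;             % assumed drift / measurement noise covariances
yhat_ad = zeros(size(y));
for t = na+1:length(y)
    x = -y(t-1:-1:t-na);               % AR regressor: y(t) = -a1*y(t-1) - a2*y(t-2) + e(t)
    P = P + Re;                        % time update (random-walk drift)
    yhat_ad(t) = x'*theta;             % adaptive 1-step prediction
    K = P*x/(Rw + x'*P*x);             % Kalman gain
    theta = theta + K*(y(t) - yhat_ad(t)); % measurement update of the parameters
    P = P - K*(x'*P);                  % covariance update
end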
%% 26-step prediction
k=26;
[F,G]=Diophantine(M1.c,M1.a,k)
SF=50; % Safety factor: start predicting SF time units before the target set so the filter's initial transient has died out
y=Mdldata(:,8); % Our zero mean modelling data vector
y=y-mean(y);
yval=Valdata(:,8); % Our zero mean adj. validation data vector
yval=yval-mean(Mdldata(:,8));
% It is crucial that the predictions and true values are aligned; it can be seen that:
% yhat_k(1)= prediction of y(end-SF+k) = y(end-SF+(k-1)+1)=ynew(1+k)
% yhat_k(2)= prediction of y(end-SF+k+1) = ynew(2+k)
% ...
% yhat_k(2+SF-k)=prediction of ynew(2+SF)=yval(1) ie the first "wanted" prediction
% yhat_k(end-k)=prediction of yval(end) ie the last "wanted" pred.
ynew=[y(end-SF:end); yval]; % We concatenate the last SF+1 values of the modelling data vector and the validation vector
yhat_26=filter(G,M1.c,ynew); % We filter this concatenated vector
figure(1)
plot(yhat_26(2+SF-k:end-k)+mean(Mdldata(:,8)))
hold on
plot(yval(1:end)+mean(Mdldata(:,8)))
legend('26-step pred','True value')
pe26=yval(1:end)-yhat_26(2+SF-k:end-k); % 26-step pred error
hold off
% The 26-step prediction follows the wiggling of the data remarkably well
figure(2)
rho = acf( pe26, 100,0.05, 1, 25 );
title("ACF for pe26");
V_pe26=var(pe26) % =4.7964
TV_pe26=F'*F*V_pe1 % =5.2584 ie higher than actual
mean(pe26) % =0.7774 underestimates temperature
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Prediction on test sets
% Define the test sets, one week each:
Test1data=climate67(5601:5768,:);
Test2data=climate67(7501:7668,:);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 7-step pred on test1
k=7;
[F,G]=Diophantine(M1.c,M1.a,k)
SF=50; % Safety factor: start predicting SF time units before the target set so the filter's initial transient has died out
yval=Valdata(:,8); % Our zero mean adj. validation data vector
yval=yval-mean(Mdldata(:,8));
ytest1=Test1data(:,8);
ytest1=ytest1-mean(Mdldata(:,8)); % Our zero mean test1 data
% It is crucial that the predictions and true values are aligned; it can be seen that:
% yhat_7(1)= prediction of yval(end-SF+k) = yval(end-SF+(k-1)+1)=ynew(1+k)
% yhat_7(2)= prediction of yval(end-SF+k+1) = ynew(2+k)
% ...
% yhat_7(2+SF-k)=prediction of ynew(2+SF)=ytest1(1) ie the first "wanted" prediction
% yhat_7(end-k)=prediction of ytest1(end) ie the last "wanted" pred.
ynew=[yval(end-SF:end); ytest1]; % We concatenate the last SF+1 values of the val data vector and the test1 vector
yhat_7=filter(G,M1.c,ynew); % We filter this concatenated vector
figure(1)
plot(yhat_7(2+SF-k:end-k)+mean(Mdldata(:,8)))
hold on
plot(ytest1(1:end)+mean(Mdldata(:,8)))
legend('7-step pred','True value')
pe7=ytest1(1:end)-yhat_7(2+SF-k:end-k); % 7-step pred error
hold off
figure(2)
rho = acf( pe7, 100,0.05, 1, 6 );
title("ACF for pe7");
V_pe7=var(pe7) % =2.2482
TV_pe7=F'*F*V_pe1 % =3.9 ie higher than actual
mean(pe7) % =0.40492;
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% 7-step pred on test2
k=7;
[F,G]=Diophantine(M1.c,M1.a,k)
SF=50; % Safety factor: start predicting SF time units before the target set so the filter's initial transient has died out
yprev=climate67(7401:7500,8); % Our zero mean adj. data vector of the 100 data points right before ytest2
yprev=yprev-mean(Mdldata(:,8));
ytest2=Test2data(:,8);
ytest2=ytest2-mean(Mdldata(:,8)); % Our zero mean test2 data
% It is crucial that the predictions and true values are aligned; it can be seen that:
% yhat_7(1)= prediction of yprev(end-SF+k) = yprev(end-SF+(k-1)+1)=ynew(1+k)
% yhat_7(2)= prediction of yprev(end-SF+k+1) = ynew(2+k)
% ...
% yhat_7(2+SF-k)=prediction of ynew(2+SF)=ytest2(1) ie the first "wanted" prediction
% yhat_7(end-k)=prediction of ytest2(end) ie the last "wanted" pred.
ynew=[yprev(end-SF:end); ytest2]; % We concatenate the last SF+1 values of the prev data vector and the test2 vector
yhat_7=filter(G,M1.c,ynew); % We filter this concatenated vector
figure(1)
plot(yhat_7(2+SF-k:end-k)+mean(Mdldata(:,8)))
hold on
plot(ytest2(1:end)+mean(Mdldata(:,8)))
legend('7-step pred','True value')
pe7=ytest2(1:end)-yhat_7(2+SF-k:end-k); % 7-step pred error
hold off
figure(2)
rho = acf( pe7, 100,0.05, 1, 6 );
title("ACF for pe7");
V_pe7=var(pe7) % =1.6895, lower than on test1 due to lower temperatures and thus lower absolute variance
TV_pe7=F'*F*V_pe1 % =3.9, ie higher than the actual value
mean(pe7) % =-1.9377, ie here the model overestimates the temperature