CrossValid1<- function(X, y, nNeg, nPos, i2) {
# Leave-m-out cross-validation of a previously estimated stepwise regression model.
# D Meko
# last revised 2024-03-08
#
# IN
# X is matrix of potential predictors
# y is vector of predictand
# nNeg and nPos are the maximum negative and positive lags that were allowed in
# the stepwise regression model of predictand on chronologies. These settings
# are used by function LeaveOut to compute m=1+4*max(nNeg,nPos), the number
# of observations to be omitted in each iteration of leave-m-out cross-
# validation
# i2 is the order in which columns of the matrix of potential predictors
# entered the model in the calling script or function, which is assumed to
# have applied stepwise regression; that order is repeated here in the
# stepwise cross-validation
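#
# Example of m: with nNeg = 1 and nPos = 1, m = 1 + 4*max(1,1) = 5, so each
# cross-validation iteration omits the predicted observation plus the 2
# nearest observations on each side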
#
# OUT
# CV [named list] statistics for maximum-RE model from forward stepwise.
# Names self-describing. Includes step of maximum RE; columns of the original
# matrix of potential predictors in the maximum-RE model; reduction of error
# statistic; validation root-mean-square error; cross-validation predictions;
# cross-validation residuals of the final model; and number of observations
# left out in leave-m-out cross-validation. Also in CV are the numeric
# vector REcvAll of RE at each step of the modeling and the corresponding
# vector RMSEvall of RMSE at each step
#
# NOTES
#
# Differs from CrossValid2 in that it computes a vector of statistics (e.g.,
# REall) for each step of a previously run stepwise regression. In contrast,
# CrossValid2 deals just with the simpler case of one model (e.g., scalar RE).
# CrossValid1 also is more complicated than CrossValid2 because CrossValid1
# has to identify the "best" model (by max RE)
#
# Absolutely important that the input column pointer i2 indicates columns of X
# in the order (left to right) in which they entered the stepwise regression
# assumed to have been done before calling this function.
#
# revised 2024-03-08: cosmetic; expansion and clarification of comments
library("pracma") # needed for mldivide, an emulation of the Matlab "backslash"
# operator through QR decomposition
source(paste(code_dir,"LeaveOut.R",sep="")) # form pointer matrix for leave-m-out
# cross-validation; code_dir is assumed to be defined in the calling environment
#--- Build pointer matrix for predictor sets
X <- as.matrix(X)
mX <- dim(X)[1]
y <- as.matrix(y)
H <- LeaveOut(nNeg,nPos,mX)
Lin <- H$Lin
mOut <- H$NumberLeftOut
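# Lin is a logical pointer matrix with one column for each of the mX
# observations; as used below, Lin[,n] flags the observations retained for
# calibration when the block of mOut observations centered on observation n
# is left out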
#### Cross-validation modeling
#... Storage for models for all steps
E1 <- matrix(NA,mX,length(i2)) # to store cross-valid residuals for all models
P1 <- E1 # to hold cross-validation predictions ...
RMSEvAll <- rep(NA,length(i2)) # to store RMSEv for all models
REall <- RMSEvAll # ... RE ...
for (k in seq_along(i2)){
#--- Storage for various 1-col matrices specific to k-step model
w1 <- matrix(NA,mX,1) # cv predictions
w2 <- w1 # null predictions (equal to calibration means)
ithis <- i2[1:k]
#--- long-term predictor matrix
Xthis <- as.matrix(X[,ithis])
a1this <- matrix(1,mX,1)
Xthis <- cbind(a1this,Xthis)
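  # (the leading column of ones carries the regression constant; mldivide
  # returns coefficients with the constant term first, matching this layout)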
for (n in 1:mX){
#--- Build predictor matrix
Lthis <- Lin[,n]
Lthis <- as.logical(as.matrix(Lthis))
nthis <- sum(Lthis)
u <- as.matrix(X[Lthis,ithis])
a1 <- matrix(1,nthis,1)
U <- cbind(a1,u) # predictor matrix
# Build predictand as 1-col matrix
v <- as.matrix(y[Lthis])
#--- Matrix left division to estimate regression parameters
b <- mldivide(U,v) # [matrix, 1 col, with coefficients, constant term first]
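    # (mldivide solves the least-squares problem min ||U %*% b - v|| via QR
    # decomposition; qr.solve(U, v) would be the base-R equivalent here)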
#--- Estimated predictand for central "left-out" observation
vhat1 <- Xthis[n,] %*% b
    w1[n] <- vhat1
w2[n] <- mean(v)
}
#--- Residuals time series
  e1 <- y - w1 # cross-validation residuals
  e2 <- y - w2 # null-model residuals (using calib means as predictions)
  #--- Validation statistics
SSE1 <- sum(e1*e1) # sum of squares of cross-validation errors
MSE1 <- SSE1/mX # mean square error of cross-validation
RMSEv <- sqrt(MSE1) # root-mean-square error of cross-validation
  SSE2 <- sum(e2*e2) # sum of squares of null-model residuals
RE <- 1 - SSE1/SSE2 # reduction of error statistic
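  # (RE > 0 means the cross-validation predictions outperform the null model
  # that uses calibration means as predictions)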
#--- Store statistics for model step
E1[,k] <- e1
P1[,k] <- w1
  RMSEvAll[k] <- RMSEv
  REall[k] <- RE
}
#--- Find maximum-RE model and its statistics
kmax <- which.max(REall) # at this step
i2cv <- i2[1:kmax]
REwinner <- REall[kmax]
Pcv <- as.matrix(P1[,kmax]) # cv predictions
Ecv <- as.matrix(E1[,kmax]) # cv errors
RMSEcv <- RMSEvAll[kmax]
CV <- list("REmaxStep"=kmax,"ColumnsIn"=i2cv,"REcvAll"=REall,"REcv"=REwinner,
"CVpredictions"=Pcv,"CVresiduals"=Ecv,"RMSEvall"=RMSEvAll,"RMSEcv"=RMSEcv,
"LeftOut"=mOut)
return(CV)
}
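#--- Usage sketch (not run)
# Minimal illustration of a call, assuming LeaveOut.R is present in code_dir
# and that a stepwise regression has already determined the entry order i2.
# The data, the code_dir value, and i2 below are hypothetical.
if (FALSE) {
  code_dir <- "./"                  # assumed location of LeaveOut.R
  set.seed(1)
  X <- matrix(rnorm(60*3),60,3)     # 60 observations of 3 potential predictors
  y <- 0.8*X[,2] + rnorm(60)        # synthetic predictand tied to column 2
  i2 <- c(2,1,3)                    # hypothetical stepwise entry order
  CV <- CrossValid1(X, y, nNeg=1, nPos=1, i2)
  CV$REmaxStep   # step at which RE peaked
  CV$REcv        # RE of the maximum-RE model
  CV$RMSEcv      # validation RMSE of that model
}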