-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathForwStep2.R
executable file
·171 lines (151 loc) · 6.52 KB
/
ForwStep2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
ForwStep2 <- function(X,namesX, y, c1) {
# Forward stepwise regression
# D. Meko
# Last revised 2024-03-04
#
# The forward stepwise entry is done until all variables are in. The final model
# is selected as the model for step at which adjusted R-square is maximum
#
# INPUT ARGUMENTS
# y [matrix] 1-col of predictand
# X [matrix] one-or-more cols of pool of potential predictors
# namesX [character] vector of ids of potential predictors (what's in cols of X)
# c1 [1x1]r required incremental increase in adjusted R-square to warrant additional
# step in forward stepwise
#
# OUTPUT
# H: named list,
# names(H)<-c('Model','StepMaxR2adj','ColsInModel','Coefficients',
# 'ColsInOrderEntry','Rsquared','RsquaredAdj','Step',
# 'RsquaredAllSteps','RsquaredAdjAllSteps','Foverall',
# 'Fpvalue')
# Model [lm object] is a special type of R object, and holds many of the
# regression statistics and data, such as the residuals and predicted data.
# R has functions that operate on lm objects. For example,
# summary(H#Model) shows a summary table of the regression
# The Coefficients, combined with ColsInModel would allow a reconstruction to
# be generated from the long time series of potenial predictors X.
# ColsInModel gives the columns of that matrix that the coefficients apply
# to. By plotting RsquaredAllSteps or RsquaredAdjAllSteps agains step, you
# can disply how R-square and adjusted R-square changes with step in the
# stepwise modelin.
# Most of the other list items are obvious from their names. StepMaxR2adj is
# the step at which adjusted R-square reaches a maximum.
#
# revised 2024-03-04: corrected error in call to ForwStep2 for stepwise variables entering
# after the first to to enter. At "r<-cor(e,X[,ibullpen])". Error could have resulted in
# MLR regression models picking next variable (not in model) most highly correlated
# with predictand, y, rather than with residuals, e, from previous step. Change could
# result in model with stronger statistics (e.g., R-squared)
source(paste(code_dir,"Fpvalue.R",sep="")) # p-value of overall-F from lm() [not written by Meko]
#--- ALLOCATE AND INITIALIZE
Np<-dim(X)[2] # size predictor pool
i1<-1:Np # index to predictors in pool
i2<-rep(NA,Np) # index in order of entry
Lin=rep(FALSE,Np) # initial for in model
R2<-rep(NA,Np)
R2a<-rep(NA,Np)
#--- FOREWARD STEPWISE REGRESSION
for (j in 1:Np){
if (j==1){
# First, pass pick X correlated highest (absolute R) with y
colnames(X)<-namesX # re-initialize these
r<-cor(y,X)
H<-sort(abs(r),decreasing=TRUE,index.return=TRUE)
iwinner<-H$ix[1]
i2[j]<-iwinner
Lin[iwinner]<-TRUE # Logical to cols of X in model to be estimated
U<-X[,Lin] # culll matrix of those predictors
G<-lm(y~U) # estimate model (G is a "lm" object )
R2this<-summary(G)$r.squared
R2adjthis<-summary(G)$adj.r.squared
R2[j]<-R2this # store R-squared for this model
R2a[j]<-R2adjthis # store adjusted R-squared ...
e<-as.matrix(G$residuals) # model residuals;
} else {
# iteration after the first. e are the residuals of the previous step.
# At each step, e is correlated with the cols of X not yet in model.
# "Chosen one" is the the col of X with highest correlations
colnames(X)<-namesX # re-initialize these
ibullpen <-i1[!Lin] # cols of X NOT yet in models
r<-cor(e,X[,ibullpen]) # rev 2024-03-04
H<-sort(abs(r),decreasing=TRUE,index.return=TRUE)
iwinner<-H$ix[1] # pointer to highes correlated member of sub-matrix
i2[j]<-ibullpen[iwinner] # corresponding column of that member in X
ithis<-ibullpen[iwinner] #... renamed for simplicity
Lin[ithis]<-TRUE # logical of cols of X in current model to be estimated
# Next statements exactly same as those described for first iteration
U<-X[,Lin]
G<-lm(y~U) # estimation
R2this<-summary(G)$r.squared
R2adjthis<-summary(G)$adj.r.squared
R2[j]<-R2this
R2a[j]<-R2adjthis
e<-as.matrix(G$residuals)
}
}
#--- FIND "BEST" MODEL AND RE-FIT IT
rm(H)
Lin<-rep(FALSE,Np) # re-initialize as no variables in model
# MAXIMUM ADJUSTED R-SQUARED VERSION
# Commented out code for using maximum adjusted R2 as criterion for best
# model. For large number of variables in pool of potential predictos, and
# with those possibly chosen by correlation screening from larger number
# of possible predictors, this criterion tends to over-fit model. Decided
# to stop entry instead FIRST maximimum of adjusted R2
#
# s<-sort(R2a,decreasing=TRUE,index.return=TRUE) # sorted adjusted R-squared, dec.
# iwinner<-s$ix[1] # step at which adjusted R-squared highest
# i2a<-i2[1:iwinner] # cols of X in model to be fit
# Lin[i2a]<-TRUE # turn on logical for cols of X in model to be fit
# FIRST LOCAL MAXIMUM OF ADJUSTED R-SQUARED
if (length(Lin)==1){
# only one predictor in pool; only one step
iwinner<-1
} else {
d<-diff(R2a) # first difference of adjusted R-squared with step in model
if (all(d>=c1)){
# Adjusted R-squared always increasing by at least c1 with step
iwinner<-length(Lin)
} else {
# adjusted R-squared increases by less than c1 at some step (step ithis+1)
ithis<-which(d < c1) # find step of first "insufficient" increase in adjusted R-squared
iwinner<-ithis[1]
}
}
i2a<-i2[1:iwinner] # cols of X in final model
Lin[i2a]<-TRUE # turn on logical for cols of X in model to be fit
#---- PULL SUBMATRIX OF PREDICTORS AND ESTIMATE MODEL
# if adjR2 begins dropping from 1st step, only 1 series enters matrix...
if(length(i2a)==1){
U <- matrix(X[,Lin])
}
else{
U <- X[,Lin]
}
colnames(X)<-namesX # re-initialize these
colnames(U)<-colnames(X)[Lin]
G<-lm(y~U) # estimation
#--- STATISTICS OF FINAL FITTED MODEL
kstep<-sum(Lin) # final step (max adj R-square)
H = list()
H[[1]]<-G
H[[2]]<-kstep
H[[3]]<-i1[Lin]
H[[4]]<-G$coefficients
H[[5]]<-i2[1:kstep]
H[[6]]<-summary(G)$r.squared
H[[7]]<-summary(G)$adj.r.squared
jstep<-(1:Np)
H[[8]]<-jstep
H[[9]]<-R2
H[[10]]<-R2a
H[[11]]<-summary(G)$fstatistic[1]
p<-Fpvalue(G)
H[[12]]<-p # pvalue of overall F
names(H)<-c('Model','StepMaxR2adj','ColsInModel','Coefficients',
'ColsInOrderEntry','Rsquared','RsquaredAdj','Step',
'RsquaredAllSteps','RsquaredAdjAllSteps','Foverall',
'Fpvalue')
return(H)
}