-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathgpmodelfilter.m
384 lines (306 loc) · 14.2 KB
/
gpmodelfilter.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
classdef gpmodelfilter
%GPMODELFILTER Object to filter a population of multigene symbolic regression models.
%
% Usage:
%
% First, create a default filter object F
%
% F = GPMODELFILTER
%
% Next, set the properties of the filter. E.g. to keep only models
% that have an R^2 >= 0.7 (training data) but contain no more than 3
% input variables use:
%
% F.MINR2TRAIN = 0.7; F.MAXVARS = 3;
%
% Finally, apply the filter to the population of models in the GP
% struct:
%
% GPF = F.APPLYFILTER(GP);
%
% This returns a structure GPF which is functionally identical to GP
% except that that models not meeting the filter specifications have
% been removed.
%
% It also removes duplicate models whose genotypes are identical. All
% the usual GPTIPS functions such as POPBROWSER, RUNTREE, GPPOPVARS,
% GPPRETTY etc. can be applied to the filtered data structure GPF.
%
% Remarks:
%
% The filter has the following settings and defaults:
%
% MINR2TRAIN = 0 (keeps models attaining this R2 on the
% training data).
%
% MAXCOMPLEXITY = Inf (keeps models that have this level of
% expressional complexity or lower).
%
% PARETOFRONT = FALSE (true to keep only models on the Pareto
% front of performance and expressional complexity). Note that
% 'expressional complexity' is used to compute the front even if
% the GPTIPS run was actually performed using 'node count' as the
% measure of tree complexity.
%
% MAXVARS = Inf (keeps models containing this max number of input
% vars).
%
% MINVARS = 0 (keeps models containing this min number of input
% vars).
%
% INCLUDEVARS = [] (keeps models that include these input variables
% - a row vector containing the input indices).
%
% EXCLUDEVARS = [] (keeps models that do not contain these
% input variables - a row vector containing the input indices).
%
% REMOVEDUPLICATES = TRUE (removes duplicate genotypes from the
% population).
%
% Hence, the default filter object only removes duplicates.
%
% [GPF,MODELINDS] = F.APPLYFILTER(GP) does the same but also returns
% a Boolean vector MODELINDS which refers to the population indices
% in GP that survived the filtering process.
%
% Copyright (c) 2009-2015 Dominic Searson
%
% GPTIPS 2
%
% See also mergegp, genefilter, popbrowser, paretoreport
properties (SetAccess = public)
minR2train = 0; %the minimum R2 on the selected dataset
maxComplexity = Inf; %the maximum complexity of models to retain
paretoFront = false; %true to select only models on the pareto front
maxVars = Inf; %selects models containing a max number of input vars
minVars = 0; %selects models containing a minimum number of input vars
includeVars =[]; %row vector of inputs that the models must contain
excludeVars = []; %row vector of inputs that the models must not contain
removeDuplicates = true; %true to remove duplicate genotypes from population
end
methods
%set removeDuplicates property
function obj = set.removeDuplicates(obj, bool)
if ~islogical(bool)
disp('Error: removeDuplicates must either be set to true or false');
return;
end
obj.removeDuplicates=bool;
end
%set excludeVars property
function obj = set.excludeVars(obj,varList)
if isempty(varList)
obj.excludeVars = varList;
return;
end
if size(varList,1) > 1
disp('Error: supplied list must be a row vector of input variable numbers.');
return;
end
if any(find(varList <= 0))
disp('Error: 0 or negative numbers are not valid input variable numbers.');
return;
end
if numel(varList) ~= numel(unique(varList))
disp('Error: supplied list must not contain duplicate input variable numbers.');
return;
end
if ~isempty(intersect(varList,obj.includeVars))
disp('Error: supplied exclude list contains variables on the include list.');
return;
end
obj.excludeVars = varList;
end%includeVars
%set includeVars property
function obj = set.includeVars(obj,varList)
if isempty(varList)
obj.includeVars = varList;
return;
end
if size(varList,1) > 1
disp('Error: supplied list must be a row vector of input variable numbers.');
return;
end
if any(find(varList <= 0))
disp('Error: 0 or negative numbers are not valid input variable numbers.');
return;
end
if numel(varList) ~= numel(unique(varList))
disp('Error: supplied list must not contain duplicate input variable numbers.');
return;
end
if numel(varList) > obj.maxVars
disp('Error: supplied list must not exceed the maxVars filter property.');
return;
end
if ~isempty(intersect(varList,obj.excludeVars))
disp('Error: supplied include list contains variables on the exclude list.');
return;
end
obj.includeVars = varList;
end%includeVars
%set R2min property
function obj = set.minR2train(obj,r2min)
if ~isa(r2min,'double')
disp('Error: minimum R^2 training must be between 0 and 1.');
return;
end
if r2min < 0 || r2min > 1
disp('Error: minimum R^2 training must be between 0 and 1.');
return;
end
obj.minR2train = r2min;
end
%set maxVars property
function obj = set.maxVars(obj,maxvars)
if ~isa(maxvars,'double')
disp('Error: max. input vars must be greater than 0.');
return;
end
if maxvars < 1
disp('Error: max. input vars must be greater than 0.');
return;
end
if maxvars < obj.minVars
disp('Error: max. input vars must be equal to or greater than min. input vars.');
return;
end
obj.maxVars = maxvars;
end
%set minVars property
function obj = set.minVars(obj,minvars)
if ~isa(minvars,'double')
disp('Error: min. input vars must be 1 or greater');
return;
end
if minvars < 1
disp('Error: min. input vars must be 1 or greater');
return;
end
if minvars > obj.maxVars
disp('Error: min. input vars must be smaller than or equal to max. input vars.');
return;
end
obj.minVars = minvars;
end
%set maxComplexity property
function obj = set.maxComplexity(obj,maxc)
if ~isa(maxc,'double')
disp('Error: maximum complexity must be a number greater than 1.');
return;
end
if maxc < 1
disp('Error: maximum complexity must be a number greater than 1.');
return;
end
obj.maxComplexity = maxc;
end
%set pareto front property
function obj = set.paretoFront(obj,bool)
if ~islogical(bool)
disp('Error: paretoFront must either be set to true or false');
return;
end
obj.paretoFront = bool;
end
%function to apply the filter settings to a GP structure
function [gp,filterInds] = applyFilter(obj,gp)
if nargin < 2
error('Usage is APPLYFILTER(GP)');
end
if ~isfield(gp.fitness,'r2train')
error('GPMODELFILTER cannot find R^2 training data. GPMODELFILTER is intended for use with populations containing multigene regression models.');
end
if gp.runcontrol.pop_size > 1000
disp('Please wait, this may take a few moments...');
end
%always do r2 & complexity filter first
filterInds = (gp.fitness.r2train >= obj.minR2train) & (gp.fitness.complexity <= obj.maxComplexity);
locations = find(filterInds);
numLeft = numel(locations);
disp([num2str(numLeft) ' models passed R^2 training (>= ' num2str(obj.minR2train) ') and expressional complexity (<= ' int2str(obj.maxComplexity) ') filter ...']);
if numLeft == 0
gp = [];
return;
end
%pareto rank 1 filter
if obj.paretoFront
disp('Computing pareto front on training data...');
paretoInds = ndfsort_rank1([(1-gp.fitness.r2train) gp.fitness.complexity]);
filterInds = filterInds & paretoInds;
end
%next apply vars filters
if ~isinf(obj.maxVars) || obj.minVars || ~isempty(obj.includeVars) || ~isempty(obj.excludeVars)
locations = find(filterInds);
numLeft = numel(locations);
disp(['Applying variable filter to ' num2str(numLeft) ' remaining models ...']);
for i=1:numLeft
hvec = gpmodelvars(gp,locations(i));
vars = find(hvec);
numvars = numel(vars);
if numvars > obj.maxVars || numvars < obj.minVars
filterInds(locations(i)) = false;
else
if ~isempty(obj.excludeVars)
if ~isempty(intersect(vars,obj.excludeVars))
filterInds(locations(i)) = false;
end
end
if ~isempty(obj.includeVars)
intersection = intersect(vars,obj.includeVars);
if numel(intersection) < numel(obj.includeVars)
filterInds(locations(i)) = false;
end
end
end
end %end of loop through individuals
end
%if enabled, loop through remaining genotypes and remove
%duplicates.
if obj.removeDuplicates && ~gp.info.duplicatesRemoved
locations = find(filterInds);
numLeft = numel(locations);
disp(['Removing genotype duplicates from ' num2str(numLeft) ' remaining models ...']);
for i=1:numLeft
for j=1:numLeft
if i~=j && locations(i) && locations(j)
model_i = gp.pop{locations(i)};
model_j = gp.pop{locations(j)};
if numel(model_i) ~= numel(model_j)
continue
end
if isequal(sort(model_i),sort(model_j))
filterInds(locations(j)) = false;
locations(j)=0;
end
end
end
end
gp.info.duplicatesRemoved = true;
end%end of removeDuplicates
numModels = sum(filterInds);
if numModels == 0
disp('No models matching all filter criteria were found.');
gp=[];
return
end
gp.pop = gp.pop(filterInds);
gp.fitness.returnvalues = gp.fitness.returnvalues(filterInds);
gp.fitness.values = gp.fitness.values(filterInds);
gp.fitness.r2train = gp.fitness.r2train(filterInds);
if isfield(gp.fitness,'r2val')
gp.fitness.r2val = gp.fitness.r2val(filterInds);
end
if isfield(gp.fitness,'r2test')
gp.fitness.r2test = gp.fitness.r2test(filterInds);
end
gp.fitness.complexity = gp.fitness.complexity(filterInds);
gp.fitness.nodecount = gp.fitness.nodecount(filterInds);
gp.runcontrol.pop_size = numModels;
gp.info.filtered = true;
gp.info.lastFilter = obj;
gp.source = 'gpmodelfilter';
disp([num2str(numModels) ' models passed the filtering process.']);
end %applyFilter
end %methods
end %classdef