From 3ddb98b680385b69ff4f47b3fbbb84f8895ab892 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 14 Mar 2024 13:05:07 -0400 Subject: [PATCH 01/26] Fix multiview API and enable oblique multiview Signed-off-by: Adam Li --- sktree/ensemble/_multiview.py | 13 -- sktree/tree/_multiview.py | 17 +-- sktree/tree/_oblique_splitter.pxd | 11 +- sktree/tree/_oblique_splitter.pyx | 241 +++++++----------------------- 4 files changed, 60 insertions(+), 222 deletions(-) diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py index f1102b66a..8f9c52971 100644 --- a/sktree/ensemble/_multiview.py +++ b/sktree/ensemble/_multiview.py @@ -159,16 +159,6 @@ class MultiViewRandomForestClassifier( - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`. - feature_combinations : float, default=None - The number of features to combine on average at each split - of the decision trees. If ``None``, then will default to the minimum of - ``(1.5, n_features)``. This controls the number of non-zeros is the - projection matrix. Setting the value to 1.0 is equivalent to a - traditional decision-tree. ``feature_combinations * max_features`` - gives the number of expected non-zeros in the projection matrix of shape - ``(max_features, n_features)``. Thus this value must always be less than - ``n_features`` in order to be valid. - feature_set_ends : array-like of int of shape (n_feature_sets,), default=None The indices of the end of each feature set. 
For example, if the first feature set is the first 10 features, and the second feature set is the @@ -270,7 +260,6 @@ def __init__( warm_start=False, class_weight=None, max_samples=None, - feature_combinations=None, feature_set_ends=None, apply_max_features_per_feature_set=False, ): @@ -287,7 +276,6 @@ def __init__( "max_leaf_nodes", "min_impurity_decrease", "random_state", - "feature_combinations", "feature_set_ends", "apply_max_features_per_feature_set", ), @@ -305,7 +293,6 @@ def __init__( self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.max_features = max_features - self.feature_combinations = feature_combinations self.feature_set_ends = feature_set_ends self.apply_max_features_per_feature_set = apply_max_features_per_feature_set diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 52b70c0df..a01ba986c 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -158,9 +158,6 @@ class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - feature_combinations : float, default=None - Not used. - ccp_alpha : non-negative float, default=0.0 Not used. @@ -226,9 +223,6 @@ class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object. - feature_combinations_ : float - The number of feature combinations on average taken to fit the tree. - feature_set_ends_ : array-like of int of shape (n_feature_sets,) The indices of the end of each feature set. 
@@ -248,10 +242,6 @@ class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): _parameter_constraints = { **DecisionTreeClassifier._parameter_constraints, - "feature_combinations": [ - Interval(Real, 1.0, None, closed="left"), - None, - ], "feature_set_ends": ["array-like", None], "apply_max_features_per_feature_set": ["boolean"], } @@ -278,7 +268,6 @@ def __init__( max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, - feature_combinations=None, ccp_alpha=0.0, store_leaf_values=False, monotonic_cst=None, @@ -302,7 +291,6 @@ def __init__( monotonic_cst=monotonic_cst, ) - self.feature_combinations = feature_combinations self.feature_set_ends = feature_set_ends self.apply_max_features_per_feature_set = apply_max_features_per_feature_set self._max_features_arr = None @@ -362,7 +350,7 @@ def _build_tree( monotonic_cst = None _, n_features = X.shape - self.feature_combinations_ = 1 + self._feature_combinations_ = 1 # Build tree criterion = self.criterion @@ -485,7 +473,7 @@ def _build_tree( min_weight_leaf, random_state, monotonic_cst, - self.feature_combinations_, + self._feature_combinations_, self.feature_set_ends_, self.n_feature_sets_, self.max_features_per_set_, @@ -584,7 +572,6 @@ def _inheritable_fitted_attribute(self): """ return [ "max_features_", - "feature_combinations_", "feature_set_ends_", "n_feature_sets_", "n_features_in_set_", diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 3f17b8c6b..ea5187c09 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -164,15 +164,8 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # XXX: This splitter is experimental. Expect changes frequently. 
-cdef class MultiViewObliqueSplitter(BestObliqueSplitter): - cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set - cdef intp_t n_feature_sets # the number of feature sets is the length of feature_set_ends + 1 - - # whether or not to uniformly sample feature-sets into each projection vector - # if True, then sample from each feature set for each projection vector - cdef bint uniform_sampling - - cdef vector[vector[intp_t]] multi_indices_to_sample +cdef class MultiViewObliqueSplitter(MultiViewSplitter): + cdef const intp_t[:] n_non_zeros_per_set # the number of non-zero features in each feature set cdef void sample_proj_mat( self, diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 23b6e722d..e6d978658 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -753,34 +753,26 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # 01: Algorithm samples features from each set equally with the same number # of candidates, but if one feature set is exhausted, then that one is no longer sampled - cdef intp_t finished_feature_set_count = 0 - cdef bint finished_feature_sets = False cdef intp_t i, j proj_i = 0 - if self.max_features_per_set is None: - while proj_i < self.max_features and not finished_feature_sets: - finished_feature_sets = False - finished_feature_set_count = 0 - - # sample from a feature set - for idx in range(self.n_feature_sets): - # indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = self.multi_indices_to_sample[idx].size() - - # Note: a temporary variable must not be used, else a copy will be made - if proj_i == 0: - for i in range(0, self.multi_indices_to_sample[idx].size() - 1): - j = rand_int(i + 1, grid_size, random_state) - self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = \ - self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] - - # keep track of which 
feature-sets are exhausted - if ifeature >= grid_size: - finished_feature_set_count += 1 - continue + # 02: Algorithm samples a different number features from each set, but considers + # each feature-set equally + while proj_i < self.max_features: + # sample from a feature set + for idx in range(self.n_feature_sets): + # get the max-features for this feature-set + max_features = self.max_features_per_set[idx] + grid_size = self.multi_indices_to_sample[idx].size() + # Note: a temporary variable must not be used, else a copy will be made + for i in range(0, self.multi_indices_to_sample[idx].size() - 1): + j = rand_int(i + 1, grid_size, random_state) + self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = \ + self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] + + for ifeature in range(max_features): # sample random feature in this set feat_i = self.multi_indices_to_sample[idx][ifeature] @@ -793,45 +785,11 @@ cdef class MultiViewSplitter(BestObliqueSplitter): proj_i += 1 if proj_i >= self.max_features: break + if proj_i >= self.max_features: + break - if finished_feature_set_count == self.n_feature_sets: - finished_feature_sets = True - ifeature += 1 - # 02: Algorithm samples a different number features from each set, but considers - # each feature-set equally - else: - while proj_i < self.max_features: - # sample from a feature set - for idx in range(self.n_feature_sets): - # get the max-features for this feature-set - max_features = self.max_features_per_set[idx] - - grid_size = self.multi_indices_to_sample[idx].size() - # Note: a temporary variable must not be used, else a copy will be made - for i in range(0, self.multi_indices_to_sample[idx].size() - 1): - j = rand_int(i + 1, grid_size, random_state) - self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = \ - self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] - - for ifeature in range(max_features): - # sample random 
feature in this set - feat_i = self.multi_indices_to_sample[idx][ifeature] - - # here, axis-aligned splits are entirely weights of 1 - weight = 1 # if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - proj_i += 1 - if proj_i >= self.max_features: - break - if proj_i >= self.max_features: - break - -# XXX: not used right now -cdef class MultiViewObliqueSplitter(BestObliqueSplitter): +cdef class MultiViewObliqueSplitter(MultiViewSplitter): def __cinit__( self, Criterion criterion, @@ -843,64 +801,23 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): float64_t feature_combinations, const intp_t[:] feature_set_ends, intp_t n_feature_sets, - bint uniform_sampling, + const intp_t[:] max_features_per_set, *argv ): self.feature_set_ends = feature_set_ends - self.uniform_sampling = uniform_sampling # infer the number of feature sets self.n_feature_sets = n_feature_sets - def __reduce__(self): - """Enable pickling the splitter.""" - return (type(self), - ( - self.criterion, - self.max_features, - self.min_samples_leaf, - self.min_weight_leaf, - self.random_state, - self.monotonic_cst.base if self.monotonic_cst is not None else None, - self.feature_combinations, - self.feature_set_ends, - self.n_feature_sets, - self.uniform_sampling, - ), self.__getstate__()) - - cdef int init( - self, - object X, - const float64_t[:, ::1] y, - const float64_t[:] sample_weight, - const unsigned char[::1] missing_values_in_feature_mask, - ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) - - self.X = X - - # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - - cdef intp_t i_feature = 0 - cdef intp_t feature_set_begin = 0 - cdef intp_t size_of_feature_set - cdef intp_t ifeat = 0 - cdef intp_t iproj = 0 - while iproj < 
self.max_features: - for i_feature in range(self.n_feature_sets): - size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - - for ifeat in range(size_of_feature_set): - self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin + (iproj * self.n_features)) - iproj += 1 - if iproj >= self.max_features: - break - if iproj >= self.max_features: - break + # replaces usage of max_features + self.max_features_per_set = max_features_per_set - feature_set_begin = self.feature_set_ends[i_feature] - return 0 + # compute # of non-zeros expected on average per feature set + cdef intp_t[:] n_non_zeros_per_set = np.zeros(self.n_feature_sets, dtype=np.intp) + cdef intp_t i + for i in range(self.n_feature_sets): + n_non_zeros_per_set[i] = (self.max_features_per_set[i] * self.feature_combinations) + self.n_non_zeros_per_set = n_non_zeros_per_set cdef void sample_proj_mat( self, @@ -913,7 +830,6 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): but now also uniformly samples features from each feature set. """ cdef intp_t n_features = self.n_features - cdef intp_t n_non_zeros = self.n_non_zeros cdef UINT32_t* random_state = &self.rand_r_state cdef intp_t i, j, feat_i, proj_i, rand_vec_index @@ -923,92 +839,47 @@ cdef class MultiViewObliqueSplitter(BestObliqueSplitter): cdef vector[intp_t] indices_to_sample cdef intp_t grid_size - # compute the number of features in each feature set - cdef intp_t n_features_in_set - # keep track of the beginning and ending indices of each feature set cdef intp_t feature_set_begin, feature_set_end, idx feature_set_begin = 0 - # keep track of number of features sampled relative to n_non_zeros - cdef intp_t ifeature = 0 - - if self.uniform_sampling: - # 01: This algorithm samples features from each feature set uniformly and combines them - # into one sparse projection vector. 
- while ifeature < n_non_zeros: - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - # sample a n_non_zeros matrix for each feature set, which proceeds by: - # - sample 'n_non_zeros' in a mtry X n_features projection matrix - # - which consists of +/- 1's chosen at a 1/2s rate - # for i in range(0, n_non_zeros_per_set): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[0] - - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features - - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - - # the new beginning is the previous end - feature_set_begin = feature_set_end - - ifeature += 1 - else: - # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates - # them independently. - feature_set_begin = 0 + # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates + # them independently. 
+ feature_set_begin = 0 - # sample from a feature set - for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - n_features_in_set = feature_set_end - feature_set_begin + # sample from a feature set using linear combinations among the two sets + for idx in range(self.n_feature_sets): + feature_set_end = self.feature_set_ends[idx] - # indices to sample is a 1D-index array of size (max_features * n_features_in_set) - # which is Fisher-Yates shuffled to sample random features in each feature set - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() + # indices to sample is a 1D-index array of size (max_features * n_features_in_set) + # which is Fisher-Yates shuffled to sample random features in each feature set + indices_to_sample = self.multi_indices_to_sample[idx] + grid_size = indices_to_sample.size() - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] + # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates + for i in range(0, grid_size): + j = rand_int(0, grid_size, random_state) + indices_to_sample[j], indices_to_sample[i] = \ + indices_to_sample[i], indices_to_sample[j] - for i in range(0, n_non_zeros): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[i] + # we want "n_non_zeros / K" for this feature set over K feature sets + for i in range(0, self.n_non_zeros_per_set[idx]): + # get the next index from the shuffled index array + rand_vec_index = indices_to_sample[i] - # get the projection index (i.e. row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // n_features - feat_i = rand_vec_index % n_features + # get the projection index (i.e. 
row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features - # sample a random weight - weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero - proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - # the new beginning is the previous end - feature_set_begin = feature_set_end + # the new beginning is the previous end + feature_set_begin = feature_set_end cdef class MultiViewSplitterTester(MultiViewSplitter): From 923c1711c280516b48ca9ae7aca2bbd5dacff919 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 14 Mar 2024 13:11:49 -0400 Subject: [PATCH 02/26] Clean up other unused kwarg path Signed-off-by: Adam Li --- sktree/ensemble/_multiview.py | 8 -- sktree/stats/tests/test_forestht.py | 1 - sktree/tests/test_multiview_forest.py | 1 - sktree/tree/_multiview.py | 112 ++++++++++---------------- sktree/tree/tests/test_multiview.py | 26 ++---- 5 files changed, 47 insertions(+), 101 deletions(-) diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py index 8f9c52971..828212335 100644 --- a/sktree/ensemble/_multiview.py +++ b/sktree/ensemble/_multiview.py @@ -165,11 +165,6 @@ class MultiViewRandomForestClassifier( next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, then this will assume that there is only one feature set. - apply_max_features_per_feature_set : bool, default=False - Whether to apply sampling per feature set, where ``max_features`` is applied - to each feature-set. If ``False``, then sampling - is applied over the entire feature space. 
- Attributes ---------- estimators_ : list of sktree.tree.ObliqueDecisionTreeClassifier @@ -261,7 +256,6 @@ def __init__( class_weight=None, max_samples=None, feature_set_ends=None, - apply_max_features_per_feature_set=False, ): super().__init__( estimator=MultiViewDecisionTreeClassifier(), @@ -277,7 +271,6 @@ def __init__( "min_impurity_decrease", "random_state", "feature_set_ends", - "apply_max_features_per_feature_set", ), bootstrap=bootstrap, oob_score=oob_score, @@ -294,7 +287,6 @@ def __init__( self.min_samples_leaf = min_samples_leaf self.max_features = max_features self.feature_set_ends = feature_set_ends - self.apply_max_features_per_feature_set = apply_max_features_per_feature_set # unused by oblique forests self.min_weight_fraction_leaf = min_weight_fraction_leaf diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 091eff99a..af4193ebf 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -710,7 +710,6 @@ def test_comight_repeated_feature_sets(): tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=feature_set_ends, max_features=0.3, - apply_max_features_per_feature_set=True, ), ), test_size=0.2, diff --git a/sktree/tests/test_multiview_forest.py b/sktree/tests/test_multiview_forest.py index 95119b580..da168bbba 100644 --- a/sktree/tests/test_multiview_forest.py +++ b/sktree/tests/test_multiview_forest.py @@ -150,7 +150,6 @@ def test_three_view_dataset(n_views, max_features): clf = MultiViewRandomForestClassifier( random_state=seed, feature_set_ends=feature_set_ends, - apply_max_features_per_feature_set=True, max_features=max_features, n_estimators=n_estimators, ) diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index a01ba986c..81aa24963 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -178,11 +178,6 @@ class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): next 20 features, then 
``feature_set_ends = [10, 30]``. If ``None``, then this will assume that there is only one feature set. - apply_max_features_per_feature_set : bool, default=False - Whether to apply sampling per feature set, where ``max_features`` is applied - to each feature-set. If ``False``, then sampling - is applied over the entire feature space. - Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -243,7 +238,6 @@ class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): _parameter_constraints = { **DecisionTreeClassifier._parameter_constraints, "feature_set_ends": ["array-like", None], - "apply_max_features_per_feature_set": ["boolean"], } _parameter_constraints.pop("max_features") _parameter_constraints["max_features"] = [ @@ -272,7 +266,6 @@ def __init__( store_leaf_values=False, monotonic_cst=None, feature_set_ends=None, - apply_max_features_per_feature_set=False, ): super().__init__( criterion=criterion, @@ -292,7 +285,6 @@ def __init__( ) self.feature_set_ends = feature_set_ends - self.apply_max_features_per_feature_set = apply_max_features_per_feature_set self._max_features_arr = None def _build_tree( @@ -383,7 +375,6 @@ def _build_tree( if isinstance(self._max_features_arr, (Integral, Real, str, type(None))): max_features_arr_ = [self._max_features_arr] * self.n_feature_sets_ - stratify_mtry_per_view = self.apply_max_features_per_feature_set else: if not isinstance(self._max_features_arr, (list, np.ndarray)): raise ValueError( @@ -396,74 +387,53 @@ def _build_tree( f"got {len(self.max_features)}" ) max_features_arr_ = self._max_features_arr - stratify_mtry_per_view = True self.n_features_in_set_ = [] - if stratify_mtry_per_view: - # XXX: experimental - # we can replace max_features_ here based on whether or not uniform logic over - # feature sets - max_features_per_set = [] - n_features_in_prev = 0 - for idx in range(self.n_feature_sets_): - max_features = max_features_arr_[idx] - - n_features_in_ = 
self.feature_set_ends_[idx] - n_features_in_prev - n_features_in_prev += n_features_in_ - self.n_features_in_set_.append(n_features_in_) - if isinstance(max_features, str): - if max_features == "sqrt": - max_features = max(1, math.ceil(np.sqrt(n_features_in_))) - elif max_features == "log2": - max_features = max(1, math.ceil(np.log2(n_features_in_))) - elif max_features is None: - max_features = n_features_in_ - elif isinstance(max_features, numbers.Integral): - max_features = max_features - else: # float - if max_features > 0.0: - max_features = max(1, math.ceil(max_features * n_features_in_)) - else: - max_features = 0 - - if max_features > n_features_in_: - raise ValueError( - f"max_features must be less than or equal to " - f"the number of features in feature set {idx}: {n_features_in_}, but " - f"max_features = {max_features} when applying sampling" - f"per feature set." - ) - - max_features_per_set.append(max_features) - self.max_features_ = np.sum(max_features_per_set) - if self.max_features_ > n_features: - raise ValueError( - "max_features is greater than the number of features: " - f"{max_features} > {n_features}." - "This should not be possible. Please submit a bug report." 
- ) - self.max_features_per_set_ = np.asarray(max_features_per_set, dtype=np.intp) - # the total number of features to sample per split - self.max_features_ = np.sum(self.max_features_per_set_) - else: - self.max_features_per_set_ = None - self.max_features = self._max_features_arr - if isinstance(self.max_features, str): - if self.max_features == "sqrt": - max_features = max(1, int(np.sqrt(self.n_features_in_))) - elif self.max_features == "log2": - max_features = max(1, int(np.log2(self.n_features_in_))) - elif self.max_features is None: - max_features = self.n_features_in_ - elif isinstance(self.max_features, numbers.Integral): - max_features = self.max_features + # XXX: experimental + # we can replace max_features_ here based on whether or not uniform logic over + # feature sets + max_features_per_set = [] + n_features_in_prev = 0 + for idx in range(self.n_feature_sets_): + max_features = max_features_arr_[idx] + + n_features_in_ = self.feature_set_ends_[idx] - n_features_in_prev + n_features_in_prev += n_features_in_ + self.n_features_in_set_.append(n_features_in_) + if isinstance(max_features, str): + if max_features == "sqrt": + max_features = max(1, math.ceil(np.sqrt(n_features_in_))) + elif max_features == "log2": + max_features = max(1, math.ceil(np.log2(n_features_in_))) + elif max_features is None: + max_features = n_features_in_ + elif isinstance(max_features, numbers.Integral): + max_features = max_features else: # float - if self.max_features > 0.0: - max_features = max(1, int(self.max_features * self.n_features_in_)) + if max_features > 0.0: + max_features = max(1, math.ceil(max_features * n_features_in_)) else: max_features = 0 - self.max_features_ = max_features + if max_features > n_features_in_: + raise ValueError( + f"max_features must be less than or equal to " + f"the number of features in feature set {idx}: {n_features_in_}, but " + f"max_features = {max_features} when applying sampling" + f"per feature set." 
+ ) + + max_features_per_set.append(max_features) + self.max_features_ = np.sum(max_features_per_set) + if self.max_features_ > n_features: + raise ValueError( + "max_features is greater than the number of features: " + f"{max_features} > {n_features}." + "This should not be possible. Please submit a bug report." + ) + self.max_features_per_set_ = np.asarray(max_features_per_set, dtype=np.intp) + # the total number of features to sample per split + self.max_features_ = np.sum(self.max_features_per_set_) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py index 419ca378d..38541197b 100644 --- a/sktree/tree/tests/test_multiview.py +++ b/sktree/tree/tests/test_multiview.py @@ -102,7 +102,6 @@ def test_multiview_errors(): random_state=seed, feature_set_ends=[3, 5], max_features=6, - apply_max_features_per_feature_set=True, ) with pytest.raises(ValueError, match="the number of features in feature set"): clf.fit(X, y) @@ -117,7 +116,6 @@ def test_multiview_separate_feature_set_sampling_sets_attributes(): random_state=seed, feature_set_ends=[6, 10], max_features=0.5, - apply_max_features_per_feature_set=True, ) clf.fit(X, y) @@ -130,7 +128,6 @@ def test_multiview_separate_feature_set_sampling_sets_attributes(): random_state=seed, feature_set_ends=[9, 13], max_features="sqrt", - apply_max_features_per_feature_set=True, ) clf.fit(X, y) assert_array_equal(clf.max_features_per_set_, [3, 2]) @@ -142,7 +139,6 @@ def test_multiview_separate_feature_set_sampling_sets_attributes(): random_state=seed, feature_set_ends=[5, 9], max_features="sqrt", - apply_max_features_per_feature_set=True, ) clf.fit(X, y) assert_array_equal(clf.max_features_per_set_, [3, 2]) @@ -160,7 +156,6 @@ def test_at_least_one_feature_per_view_is_sampled(): random_state=seed, feature_set_ends=[1, 2, 4, 10], max_features=0.4, - apply_max_features_per_feature_set=True, ) clf.fit(X, y) @@ 
-178,7 +173,6 @@ def test_multiview_separate_feature_set_sampling_is_consistent(): random_state=seed, feature_set_ends=[1, 3, 6, 10], max_features=[1, 2, 2, 3], - apply_max_features_per_feature_set=True, ) clf.fit(X, y) @@ -192,15 +186,13 @@ def test_multiview_separate_feature_set_sampling_is_consistent(): random_state=seed, feature_set_ends=[1, 3, 6, 10], max_features=[1, 2, 2, 3], - apply_max_features_per_feature_set=False, ) other_clf.fit(X, y) assert_array_equal(other_clf.tree_.value, clf.tree_.value) -@pytest.mark.parametrize("stratify_mtry_per_view", [True, False]) -def test_separate_mtry_per_feature_set(stratify_mtry_per_view): +def test_separate_mtry_per_feature_set(): """Test that multiview decision tree can sample different numbers of features per view. Sets the ``max_feature`` argument as an array-like. @@ -213,7 +205,6 @@ def test_separate_mtry_per_feature_set(stratify_mtry_per_view): random_state=seed, feature_set_ends=[1, 2, 4, 10], max_features=[0.4, 0.5, 0.6, 0.7], - apply_max_features_per_feature_set=stratify_mtry_per_view, ) clf.fit(X, y) @@ -225,7 +216,6 @@ def test_separate_mtry_per_feature_set(stratify_mtry_per_view): random_state=seed, feature_set_ends=[1, 2, 4, 10], max_features=[1, 1, 1, 1.0], - apply_max_features_per_feature_set=stratify_mtry_per_view, ) clf.fit(X, y) assert_array_equal(clf.max_features_per_set_, [1, 1, 1, 6]) @@ -236,14 +226,9 @@ def test_separate_mtry_per_feature_set(stratify_mtry_per_view): random_state=seed, feature_set_ends=[1, 2, 4, 10], max_features=1.0, - apply_max_features_per_feature_set=stratify_mtry_per_view, ) clf.fit(X, y) - if stratify_mtry_per_view: - assert_array_equal(clf.max_features_per_set_, [1, 1, 2, 6]) - else: - assert clf.max_features_per_set_ is None - assert clf.max_features_ == 10 + assert_array_equal(clf.max_features_per_set_, [1, 1, 2, 6]) assert clf.max_features_ == 10, np.sum(clf.max_features_per_set_) @@ -262,9 +247,10 @@ def test_multiview_without_feature_view_stratification(): 
random_state=seed, feature_set_ends=[497, 500], max_features=0.3, - apply_max_features_per_feature_set=False, ) clf.fit(X, y) - assert clf.max_features_per_set_ is None - assert clf.max_features_ == 500 * clf.max_features, clf.max_features_ + assert_array_equal(clf.max_features_per_set_, [150, 1]), clf.max_features_per_set_ + assert clf.max_features_ == math.ceil(497.0 * clf.max_features) + math.ceil( + 3 * clf.max_features + ) From 500dca969defaf97c85818d138f587db2e301c15 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 14 Mar 2024 13:13:57 -0400 Subject: [PATCH 03/26] add changelog Signed-off-by: Adam Li --- doc/whats_new/v0.8.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 0a7ba4f58..183d217df 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -13,6 +13,12 @@ Version 0.8 Changelog --------- +- |API| :class:`sktree.tree.MultiViewDecisionTreeClassifier` does not have the + ``apply_max_features_per_feature_set`` argument anymore. Instead, the + ``max_features`` argument is used to control the number of features to + consider when looking for the best split within each feature set explicitly. + By `Adam Li`_ :pr:`#247`. 
+ Code and Documentation Contributors ----------------------------------- From 954c6fc49853e5d50884b46d452604de0ab930f2 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 15 Mar 2024 20:52:16 -0400 Subject: [PATCH 04/26] Fix examples Signed-off-by: Adam Li --- .../plot_MI_imbalanced_hyppo_testing.py | 1 - .../plot_co_MIGHT_alternative.py | 1 - .../hypothesis_testing/plot_co_MIGHT_null.py | 2 - .../plot_multiview_axis_aligned_splitter.py | 3 - sktree/tree/__init__.py | 3 +- sktree/tree/_multiview.py | 468 ++++++++++++++++++ sktree/tree/_oblique_splitter.pyx | 2 + sktree/tree/tests/test_all_trees.py | 10 +- 8 files changed, 479 insertions(+), 11 deletions(-) diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 95c5341ae..de3473c4d 100644 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -130,7 +130,6 @@ def make_multiview_classification( max_features=max_features, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=n_features_views, - apply_max_features_per_feature_set=True, ), random_state=seed, honest_fraction=0.5, diff --git a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py b/examples/hypothesis_testing/plot_co_MIGHT_alternative.py index fd33c335e..97c7b6611 100644 --- a/examples/hypothesis_testing/plot_co_MIGHT_alternative.py +++ b/examples/hypothesis_testing/plot_co_MIGHT_alternative.py @@ -112,7 +112,6 @@ max_features=max_features, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, ), random_state=seed, honest_fraction=0.5, diff --git a/examples/hypothesis_testing/plot_co_MIGHT_null.py b/examples/hypothesis_testing/plot_co_MIGHT_null.py index 2e6325cd1..b6f2f9346 100644 --- a/examples/hypothesis_testing/plot_co_MIGHT_null.py +++ b/examples/hypothesis_testing/plot_co_MIGHT_null.py @@ 
-84,7 +84,6 @@ max_features=max_features, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, ), random_state=seed, honest_fraction=0.5, @@ -203,7 +202,6 @@ max_features=max_features, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=n_features_ends, - apply_max_features_per_feature_set=True, ), random_state=seed, honest_fraction=0.5, diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py index 064000153..00b8c0280 100644 --- a/examples/splitters/plot_multiview_axis_aligned_splitter.py +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -127,9 +127,6 @@ # more than the second feature set, we can specify ``max_features_per_set`` as follows: # ``max_features_per_set = [3, 1]``. This will sample from the first feature set three times # and the second feature set once. -# -# .. note:: In practice, this is controlled by the ``apply_max_features_per_feature_set`` parameter -# in :class:`sktree.tree.MultiViewDecisionTreeClassifier`. 
max_features_per_set_ = np.array([1, 2, 3], dtype=int) max_features = np.sum(max_features_per_set_) diff --git a/sktree/tree/__init__.py b/sktree/tree/__init__.py index 797338ac3..dc5465a60 100644 --- a/sktree/tree/__init__.py +++ b/sktree/tree/__init__.py @@ -15,7 +15,7 @@ UnsupervisedObliqueDecisionTree, ) from ._honest_tree import HonestTreeClassifier -from ._multiview import MultiViewDecisionTreeClassifier +from ._multiview import MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier from ._neighbors import compute_forest_similarity_matrix __all__ = [ @@ -34,4 +34,5 @@ "ExtraTreeClassifier", "ExtraTreeRegressor", "MultiViewDecisionTreeClassifier", + "MultiViewObliqueDecisionTreeClassifier", ] diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 81aa24963..8be7ed74c 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -35,6 +35,10 @@ "best": _oblique_splitter.MultiViewSplitter, } +OBLIQUE_DENSE_SPLITTERS = { + "best": _oblique_splitter.MultiViewObliqueSplitter, +} + class MultiViewDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): """A multi-view axis-aligned decision tree classifier. @@ -547,3 +551,467 @@ def _inheritable_fitted_attribute(self): "n_features_in_set_", "max_features_per_set_", ] + + +class MultiViewObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassifier): + """A multi-view OBLIQUE decision tree classifier. + + This is an experimental feature that applies an oblique decision tree to + multiple feature-sets concatenated across columns in ``X``. + + Parameters + ---------- + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + + splitter : {"best"}, default="best" + The strategy used to choose the split at each node. + + max_depth : int, default=None + The maximum depth of the tree. 
If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : array-like, int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. 
+ + If array-like, then `max_features` is the number of features to consider + for each feature set following the same logic as above, where + ``n_features`` is the number of features in the respective feature set. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater than ``n_features``. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary <random_state>` for details. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. 
+ + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + ccp_alpha : non-negative float, default=0.0 + Not used. + + store_leaf_values : bool, default=False + Whether to store the leaf values. + + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonicity constraint to enforce on each feature. + - 1: monotonic increase + - 0: no constraint + - -1: monotonic decrease + + Not used. + + feature_set_ends : array-like of int of shape (n_feature_sets,), default=None + The indices of the end of each feature set. For example, if the first + feature set is the first 10 features, and the second feature set is the + next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, + then this will assume that there is only one feature set. 
+ + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray + The classes labels (single output problem), + or a list of arrays of class labels (multi-output problem). + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance [4]_. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + max_features_ : int + The inferred value of max_features. + + n_classes_ : int or list of int + The number of classes (for single output problems), + or a list containing the number of classes for each + output (for multi-output problems). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + tree_ : Tree instance + The underlying Tree object. Please refer to + ``help(sklearn.tree._tree.Tree)`` for + attributes of Tree object. + + feature_set_ends_ : array-like of int of shape (n_feature_sets,) + The indices of the end of each feature set. + + n_feature_sets_ : int + The number of feature sets. + + max_features_per_set_ : array-like of int of shape (n_feature_sets,) + The number of features to sample per feature set. If ``None``, then + ``max_features`` is applied to the entire feature space. + + See Also + -------- + sklearn.tree.DecisionTreeClassifier : An axis-aligned decision tree classifier. 
+ """ + + tree_type = "oblique" + + _parameter_constraints = { + **DecisionTreeClassifier._parameter_constraints, + "feature_set_ends": ["array-like", None], + "feature_combinations": [ + Interval(Real, 1.0, None, closed="left"), + None, + ], + } + _parameter_constraints.pop("max_features") + _parameter_constraints["max_features"] = [ + Interval(Integral, 1, None, closed="left"), + Interval(RealNotInt, 0.0, 1.0, closed="right"), + StrOptions({"sqrt", "log2"}), + "array-like", + None, + ] + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + ccp_alpha=0.0, + store_leaf_values=False, + monotonic_cst=None, + feature_set_ends=None, + feature_combinations=None, + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + ccp_alpha=ccp_alpha, + store_leaf_values=store_leaf_values, + monotonic_cst=monotonic_cst, + ) + + self.feature_set_ends = feature_set_ends + self.feature_combinations = feature_combinations + self._max_features_arr = None + + def _build_tree( + self, + X, + y, + sample_weight, + missing_values_in_feature_mask, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. 
+ + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + min_samples_leaf : int or float + The minimum number of samples required to be at a leaf node. + + min_weight_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. 
+ """ + monotonic_cst = None + _, n_features = X.shape + + self.feature_combinations_ = ( + self.feature_combinations if self.feature_combinations is not None else 1.5 + ) + + # Build tree + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + if self.feature_set_ends is None: + self.feature_set_ends_ = np.asarray([n_features], dtype=np.intp) + else: + self.feature_set_ends_ = np.atleast_1d(self.feature_set_ends).astype(np.intp) + self.n_feature_sets_ = len(self.feature_set_ends_) + if self.feature_set_ends_[-1] != n_features: + raise ValueError( + f"The last feature set end must be equal to the number of features, " + f"{n_features}, but got {self.feature_set_ends_[-1]}." + ) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." 
+ ) + + if isinstance(self._max_features_arr, (Integral, Real, str, type(None))): + max_features_arr_ = [self._max_features_arr] * self.n_feature_sets_ + else: + if not isinstance(self._max_features_arr, (list, np.ndarray)): + raise ValueError( + f"max_features must be an array-like, int, float, str, or None; " + f"got {type(self._max_features_arr)}" + ) + if len(self._max_features_arr) != self.n_feature_sets_: + raise ValueError( + f"max_features must be an array-like of length {self.n_feature_sets_}; " + f"got {len(self.max_features)}" + ) + max_features_arr_ = self._max_features_arr + + self.n_features_in_set_ = [] + # XXX: experimental + # we can replace max_features_ here based on whether or not uniform logic over + # feature sets + max_features_per_set = [] + n_features_in_prev = 0 + for idx in range(self.n_feature_sets_): + max_features = max_features_arr_[idx] + + n_features_in_ = self.feature_set_ends_[idx] - n_features_in_prev + n_features_in_prev += n_features_in_ + self.n_features_in_set_.append(n_features_in_) + if isinstance(max_features, str): + if max_features == "sqrt": + max_features = max(1, math.ceil(np.sqrt(n_features_in_))) + elif max_features == "log2": + max_features = max(1, math.ceil(np.log2(n_features_in_))) + elif max_features is None: + max_features = n_features_in_ + elif isinstance(max_features, numbers.Integral): + max_features = max_features + else: # float + if max_features > 0.0: + max_features = max(1, math.ceil(max_features * n_features_in_)) + else: + max_features = 0 + + if max_features > n_features_in_: + raise ValueError( + f"max_features must be less than or equal to " + f"the number of features in feature set {idx}: {n_features_in_}, but " + f"max_features = {max_features} when applying sampling" + f"per feature set." 
+ ) + + max_features_per_set.append(max_features) + self.max_features_ = np.sum(max_features_per_set) + if self.max_features_ > n_features: + raise ValueError( + "max_features is greater than the number of features: " + f"{max_features} > {n_features}." + "This should not be possible. Please submit a bug report." + ) + self.max_features_per_set_ = np.asarray(max_features_per_set, dtype=np.intp) + # the total number of features to sample per split + self.max_features_ = np.sum(self.max_features_per_set_) + + if not isinstance(self.splitter, ObliqueSplitter): + splitter = OBLIQUE_DENSE_SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + self.feature_combinations_, + self.feature_set_ends_, + self.n_feature_sets_, + self.max_features_per_set_, + ) + + self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + self.builder_ = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + ) + else: + self.builder_ = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + ) + + self.builder_.build(self.tree_, X, y, sample_weight, None) + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + @property + def _inheritable_fitted_attribute(self): + """Define additional attributes to pass onto a parent meta tree-estimator. + + Used for passing parameters to HonestTreeClassifier. 
+ """ + return [ + "max_features_", + "feature_set_ends_", + "n_feature_sets_", + "n_features_in_set_", + "max_features_per_set_", + "feature_combinations_", + ] diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index e6d978658..a3b9c6771 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -789,6 +789,8 @@ cdef class MultiViewSplitter(BestObliqueSplitter): break +# TODO: need to check segfault for multiview oblique splitter +# REBUILD WITH BOUNDS CHECK cdef class MultiViewObliqueSplitter(MultiViewSplitter): def __cinit__( self, diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py index 66a9ea307..c5a7708b5 100644 --- a/sktree/tree/tests/test_all_trees.py +++ b/sktree/tree/tests/test_all_trees.py @@ -2,13 +2,15 @@ import numpy as np import pytest from numpy.testing import assert_almost_equal, assert_array_equal -from sklearn.base import is_classifier +from sklearn.base import is_classifier, is_regressor from sklearn.datasets import make_blobs from sklearn.tree._tree import TREE_LEAF from sktree.tree import ( ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor, + MultiViewDecisionTreeClassifier, + MultiViewObliqueDecisionTreeClassifier, ObliqueDecisionTreeClassifier, ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeClassifier, @@ -26,6 +28,8 @@ PatchObliqueDecisionTreeClassifier, UnsupervisedDecisionTree, UnsupervisedObliqueDecisionTree, + MultiViewDecisionTreeClassifier, + MultiViewObliqueDecisionTreeClassifier, ] @@ -121,7 +125,7 @@ def assert_tree_equal(d, s, message): @pytest.mark.parametrize( "TREE", - [ObliqueDecisionTreeClassifier, UnsupervisedDecisionTree, UnsupervisedObliqueDecisionTree], + ALL_TREES, ) def test_tree_deserialization_from_read_only_buffer(tmpdir, TREE): """Check that Trees can be deserialized with read only buffers. 
@@ -131,7 +135,7 @@ def test_tree_deserialization_from_read_only_buffer(tmpdir, TREE): pickle_path = str(tmpdir.join("clf.joblib")) clf = TREE(random_state=0) - if is_classifier(TREE): + if is_classifier(TREE) or is_regressor(TREE): clf.fit(X_small, y_small) else: clf.fit(X_small) From e6ea30bef2966faffb97edb5721790a98746a327 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 24 Jun 2024 17:23:16 -0400 Subject: [PATCH 05/26] Almost working Signed-off-by: Adam Li --- sktree/tree/_oblique_splitter.pyx | 32 ++++++++++++++++++------------- test_mvoblique_tree.py | 1 + 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index f9e5a8edd..8dd6ea704 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -331,6 +331,9 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) + with gil: + print("Finished sampling projection matrix") + # For every vector in the projection matrix for feat_i in range(max_features): # Projection vector has no nonzeros @@ -724,15 +727,18 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # create a helper array for allowing efficient Fisher-Yates self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) + # create a helper array for allowing efficient Fisher-Yates cdef intp_t i_feature = 0 cdef intp_t feature_set_begin = 0 - cdef intp_t size_of_feature_set + cdef intp_t size_of_feature_set, size_of_sampling cdef intp_t ifeat = 0 for i_feature in range(self.n_feature_sets): size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - for ifeat in range(size_of_feature_set): - self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) + size_of_sampling = self.max_features_per_set[i_feature] * size_of_feature_set + # push an index corresponding to each element we want to sample + for ifeat 
in range(size_of_sampling): + self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) feature_set_begin = self.feature_set_ends[i_feature] return 0 @@ -826,8 +832,6 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): for i in range(self.n_feature_sets): n_non_zeros_per_set[i] = (self.max_features_per_set[i] * self.feature_combinations) self.n_non_zeros_per_set = n_non_zeros_per_set - with gil: - print("Initialized") cdef void sample_proj_mat( self, @@ -850,19 +854,14 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): cdef intp_t grid_size # keep track of the beginning and ending indices of each feature set - cdef intp_t feature_set_begin, feature_set_end, idx - feature_set_begin = 0 + cdef intp_t idx # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates # them independently. - feature_set_begin = 0 - with gil: - print("Starting to sample projection matrix") + print("Starting to sample projection matrix", self.n_feature_sets) # sample from a feature set using linear combinations among the two sets for idx in range(self.n_feature_sets): - feature_set_end = self.feature_set_ends[idx] - # indices to sample is a 1D-index array of size (max_features * n_features_in_set) # which is Fisher-Yates shuffled to sample random features in each feature set indices_to_sample = self.multi_indices_to_sample[idx] @@ -874,6 +873,11 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): indices_to_sample[j], indices_to_sample[i] = \ indices_to_sample[i], indices_to_sample[j] + with gil: + print(idx, "Finished fisher yates...") + print(len(self.n_non_zeros_per_set), len(self.max_features_per_set), len(self.multi_indices_to_sample)) + print(len(indices_to_sample), grid_size, self.n_non_zeros_per_set[idx]) + # we want "n_non_zeros / K" for this feature set over K feature sets for i in range(0, self.n_non_zeros_per_set[idx]): # get the next index from the shuffled index array @@ -887,11 +891,13 @@ cdef class 
MultiViewObliqueSplitter(MultiViewSplitter): # sample a random weight weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + # with gil: + # print(i, proj_i, feat_i) proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero # the new beginning is the previous end - feature_set_begin = feature_set_end + # feature_set_begin = feature_set_end cdef class MultiViewSplitterTester(MultiViewSplitter): diff --git a/test_mvoblique_tree.py b/test_mvoblique_tree.py index 02c81d395..41fd7c357 100644 --- a/test_mvoblique_tree.py +++ b/test_mvoblique_tree.py @@ -52,6 +52,7 @@ feature_set_ends=[n_features_1, X.shape[1]], max_features=0.3, ) +print(X.shape) clf.fit(X, y) assert ( accuracy_score(y, clf.predict(X)) == 1.0 From 121868020f402bca68c588a458827a1882594bcf Mon Sep 17 00:00:00 2001 From: Adam Li Date: Mon, 24 Jun 2024 17:41:04 -0400 Subject: [PATCH 06/26] Fix changelog Signed-off-by: Adam Li --- doc/whats_new/v0.8.rst | 5 ----- doc/whats_new/v0.9.rst | 6 +++++- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/whats_new/v0.8.rst b/doc/whats_new/v0.8.rst index 521c80ecd..a0949489d 100644 --- a/doc/whats_new/v0.8.rst +++ b/doc/whats_new/v0.8.rst @@ -32,11 +32,6 @@ Changelog estimated on oob samples were biased when there was a low number of samples due to imbalance in the classes when ``bootstrap=True``. By `Adam Li`_ (:pr:`#283`) -- |API| :class:`sktree.tree.MultiViewDecisionTreeClassifier` do not have the - ``apply_max_features_per_feature_set`` argument anymore. Instead, the - ``max_features`` argument is used to control the number of features to - consider when looking for the best split within each feature set explicitly. - By `Adam Li`_ :pr:`#247`. 
Code and Documentation Contributors ----------------------------------- diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index 9c5ffb3b2..696929b5e 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -13,7 +13,11 @@ Version 0.9 Changelog --------- -- +- |API| :class:`sktree.tree.MultiViewDecisionTreeClassifier` does not have the + ``apply_max_features_per_feature_set`` argument anymore. Instead, the + ``max_features`` argument is used to control the number of features to + consider when looking for the best split within each feature set explicitly. + By `Adam Li`_ :pr:`#247`. Code and Documentation Contributors ----------------------------------- From bba2e8fb9916d61de40bf0f1645b66c4fcf454e9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 26 Jun 2024 08:59:53 -0400 Subject: [PATCH 07/26] Update submodule Signed-off-by: Adam Li --- sktree/_lib/sklearn_fork | 2 +- sktree/tree/_oblique_splitter.pxd | 12 ++-- sktree/tree/_oblique_splitter.pyx | 30 ++++----- sktree/tree/tests/test_all_trees.py | 5 +- test_mvoblique_tree.py | 98 +++++++++++++++++------------ 5 files changed, 87 insertions(+), 60 deletions(-) diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 74b2e699a..d455aa16e 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 74b2e699a2607b190ce6fc49b7625231023989c0 +Subproject commit d455aa16ee9cc42ce342dd07d9b94db117783fcc diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index a7a2dcfd7..fa83d7416 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -95,9 +95,13 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # to split the samples samples[start:end]. 
# Oblique Splitting extra parameters - cdef public float64_t feature_combinations # Number of features to combine + cdef public float64_t feature_combinations # Number of features to combine cdef intp_t n_non_zeros # Number of non-zero features - cdef intp_t[::1] indices_to_sample # an array of indices to sample of size mtry X n_features + cdef intp_t[::1] indices_to_sample # An array of indices to sample of size mtry X n_features + # # to sample from that produces a non-zero feature combination. + # # This array is multiplied by the data matrix n_samples X n_features + # # to produce a non-zero feature combination of size + # # n_samples X mtry. # All oblique splitters (i.e. non-axis aligned splitters) require a # function to sample a projection matrix that is applied to the feature matrix @@ -139,10 +143,10 @@ cdef class RandomObliqueSplitter(ObliqueSplitter): # XXX: This splitter is experimental. Expect changes frequently. cdef class MultiViewSplitter(BestObliqueSplitter): - cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set + cdef const intp_t[:] feature_set_ends # an array indicating the column indices of the end of each feature set cdef intp_t n_feature_sets # the number of feature sets is the length of feature_set_ends + 1 - cdef const intp_t[:] max_features_per_set # the maximum number of features to sample from each feature set + cdef const intp_t[:] max_features_per_set # the maximum number of features to sample from each feature set cdef vector[vector[intp_t]] multi_indices_to_sample diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 8dd6ea704..4e4f0860c 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -331,8 +331,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) - with gil: - print("Finished sampling projection 
matrix") + # with gil: + # print("Finished sampling projection matrix") # For every vector in the projection matrix for feat_i in range(max_features): @@ -733,12 +733,15 @@ cdef class MultiViewSplitter(BestObliqueSplitter): cdef intp_t size_of_feature_set, size_of_sampling cdef intp_t ifeat = 0 for i_feature in range(self.n_feature_sets): + # n_features * max_features_per_set size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin size_of_sampling = self.max_features_per_set[i_feature] * size_of_feature_set # push an index corresponding to each element we want to sample for ifeat in range(size_of_sampling): self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) + + print(i_feature, ifeat + feature_set_begin) feature_set_begin = self.feature_set_ends[i_feature] return 0 @@ -791,9 +794,15 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # here, axis-aligned splits are entirely weights of 1 weight = 1 # if (rand_int(0, 2, random_state) == 1) else -1 - proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_indices[proj_i].push_back(feat_i) # Store vectorized index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + # XXX: debug only + if feat_i > self.n_features: + with gil: + print(idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) + + # break early if we've sampled enough features proj_i += 1 if proj_i >= self.max_features: break @@ -858,8 +867,6 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates # them independently. 
- with gil: - print("Starting to sample projection matrix", self.n_feature_sets) # sample from a feature set using linear combinations among the two sets for idx in range(self.n_feature_sets): # indices to sample is a 1D-index array of size (max_features * n_features_in_set) @@ -873,10 +880,10 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): indices_to_sample[j], indices_to_sample[i] = \ indices_to_sample[i], indices_to_sample[j] - with gil: - print(idx, "Finished fisher yates...") - print(len(self.n_non_zeros_per_set), len(self.max_features_per_set), len(self.multi_indices_to_sample)) - print(len(indices_to_sample), grid_size, self.n_non_zeros_per_set[idx]) + # with gil: + # print(idx, "Finished fisher yates...") + # print(len(self.n_non_zeros_per_set), len(self.max_features_per_set), len(self.multi_indices_to_sample)) + # print(len(indices_to_sample), grid_size, self.n_non_zeros_per_set[idx]) # we want "n_non_zeros / K" for this feature set over K feature sets for i in range(0, self.n_non_zeros_per_set[idx]): @@ -891,14 +898,9 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): # sample a random weight weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 - # with gil: - # print(i, proj_i, feat_i) proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - # the new beginning is the previous end - # feature_set_begin = feature_set_end - cdef class MultiViewSplitterTester(MultiViewSplitter): """A class to expose a Python interface for testing.""" diff --git a/sktree/tree/tests/test_all_trees.py b/sktree/tree/tests/test_all_trees.py index 6d06e0fcd..f161cc57b 100644 --- a/sktree/tree/tests/test_all_trees.py +++ b/sktree/tree/tests/test_all_trees.py @@ -10,6 +10,7 @@ ExtraObliqueDecisionTreeClassifier, ExtraObliqueDecisionTreeRegressor, MultiViewDecisionTreeClassifier, + MultiViewObliqueDecisionTreeClassifier, ObliqueDecisionTreeClassifier, 
ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeClassifier, @@ -28,7 +29,7 @@ UnsupervisedDecisionTree, UnsupervisedObliqueDecisionTree, MultiViewDecisionTreeClassifier, - # MultiViewObliqueDecisionTreeClassifier, + MultiViewObliqueDecisionTreeClassifier, ] @@ -122,6 +123,7 @@ def assert_tree_equal(d, s, message): ] +@pytest.mark.skip() @pytest.mark.parametrize( "TREE", ALL_TREES, @@ -135,6 +137,7 @@ def test_tree_deserialization_from_read_only_buffer(tmpdir, TREE): clf = TREE(random_state=0) if is_classifier(TREE) or is_regressor(TREE): + print(X_small.shape) clf.fit(X_small, y_small) else: clf.fit(X_small) diff --git a/test_mvoblique_tree.py b/test_mvoblique_tree.py index 41fd7c357..c72aaba1b 100644 --- a/test_mvoblique_tree.py +++ b/test_mvoblique_tree.py @@ -18,45 +18,63 @@ rng = np.random.default_rng(seed=seed) -n_samples = 20 -n_features_1 = 5 -n_features_2 = 1000 -cluster_std = 5.0 - -# Create a high-dimensional multiview dataset with a low-dimensional informative -# subspace in one view of the dataset. 
-X0_first, y0 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, -) -X1_first, y1 = make_blobs( - n_samples=n_samples, - cluster_std=cluster_std, - n_features=n_features_1, - random_state=rng.integers(1, 10000), - centers=1, -) -y1[:] = 1 -X0 = np.concatenate([X0_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) -X1 = np.concatenate([X1_first, rng.standard_normal(size=(n_samples, n_features_2))], axis=1) -X = np.vstack((X0, X1)) -y = np.hstack((y0, y1)).T - -# Compare multiview decision tree vs single-view decision tree -clf = MultiViewObliqueDecisionTreeClassifier( - random_state=seed, - feature_set_ends=[n_features_1, X.shape[1]], - max_features=0.3, +X_small = np.array( + [ + [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0], + [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1], + [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1], + [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1], + [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1], + [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], + [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0], + [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0], + [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0], + [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0], + [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0], + [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1], + [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1], + [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1], + [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1], + [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1], + [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1], + [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1], + [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 
0, 0, 1, 0], + ] ) -print(X.shape) -clf.fit(X, y) -assert ( - accuracy_score(y, clf.predict(X)) == 1.0 -), f"Accuracy score: {accuracy_score(y, clf.predict(X))}" -assert ( - cross_val_score(clf, X, y, cv=5).mean() > 0.9 -), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" + +y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] +y_small_reg = [ + 1.0, + 2.1, + 1.2, + 0.05, + 10, + 2.4, + 3.1, + 1.01, + 0.01, + 2.98, + 3.1, + 1.1, + 0.0, + 1.2, + 2, + 11, + 0, + 0, + 4.5, + 0.201, + 1.06, + 0.9, + 0, +] + +clf = MultiViewDecisionTreeClassifier(random_state=0) + +print(X_small.shape) +clf.fit(X_small, y_small) From 01cab41c09c23842ad42226c2d8dca8d0373244a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 3 Jul 2024 17:15:17 -0400 Subject: [PATCH 08/26] WIP Signed-off-by: Adam Li --- benchmarks_nonasv/bench_forestht.py | 2 + sktree/stats/forestht.py | 1 + sktree/tree/_multiview.py | 1 + sktree/tree/_oblique_splitter.pxd | 10 +++- sktree/tree/_oblique_splitter.pyx | 69 +++++++++++++++------ sktree/tree/tests/test_multiview.py | 2 + test_mvoblique_tree.py | 93 +++++++++++------------------ 7 files changed, 99 insertions(+), 79 deletions(-) diff --git a/benchmarks_nonasv/bench_forestht.py b/benchmarks_nonasv/bench_forestht.py index 59e4dff9b..2bf0e6926 100644 --- a/benchmarks_nonasv/bench_forestht.py +++ b/benchmarks_nonasv/bench_forestht.py @@ -13,6 +13,8 @@ import seaborn as sns from scipy.special import expit +# using an outdated API, but the code could get refactored to use our new API +# build_coleman_forest, build_oob_forest, etc. 
from sktree.stats import PermutationForestClassifier, PermutationForestRegressor seed = 12345 diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index b71081806..163341303 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -140,6 +140,7 @@ def build_coleman_forest( if y.ndim == 1: y = y.reshape(-1, 1) + metric_star, metric_star_pi = _compute_null_distribution_coleman( y, orig_forest_proba, diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 6ac4ad494..3934d3e31 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -441,6 +441,7 @@ def _build_tree( # the total number of features to sample per split self.max_features_ = np.sum(self.max_features_per_set_) + print(self.max_features_, self.max_features_per_set_, self.feature_set_ends_, self.n_features_in_set_) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( criterion, diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index fa83d7416..9f6df2d3d 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -96,8 +96,11 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # Oblique Splitting extra parameters cdef public float64_t feature_combinations # Number of features to combine - cdef intp_t n_non_zeros # Number of non-zero features - cdef intp_t[::1] indices_to_sample # An array of indices to sample of size mtry X n_features + cdef intp_t n_non_zeros # Number of non-zero features to sample per projection matrix + + # Oblique Splitting extra parameters (mtry, n_dims) matrix + # This will contain indices 0 to mtry*n_features to allow efficient shuffling. + cdef intp_t[::1] indices_to_sample # A 2D array of indices to sample of size mtry X n_features # # to sample from that produces a non-zero feature combination. 
# # This array is multiplied by the data matrix n_samples X n_features # # to produce a non-zero feature combination of size @@ -148,6 +151,9 @@ cdef class MultiViewSplitter(BestObliqueSplitter): cdef const intp_t[:] max_features_per_set # the maximum number of features to sample from each feature set + # Each feature set has a different set of indices to sample from with a potentially different + # max_features argument. This is a 2D array of indices to sample of size mtry_in_set X features_in_set + # to sample from that produces a non-zero feature combination for each feature set. cdef vector[vector[intp_t]] multi_indices_to_sample cdef void sample_proj_mat( diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 4e4f0860c..aee8e2881 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -8,6 +8,7 @@ import numpy as np from cython.operator cimport dereference as deref from libcpp.vector cimport vector +from libcpp.algorithm cimport swap from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform @@ -132,6 +133,17 @@ cdef class BaseObliqueSplitter(Splitter): intp_t grid_size, uint32_t* random_state, ) noexcept nogil: + """Fisher-Yates shuffle for a 1D memoryview of indices. + + Parameters + ---------- + indices_to_sample : memoryview of intp_t + The memoryview of indices to shuffle. + grid_size : intp_t + The number of times to shuffle the array. + random_state : uint32_t* + The random state to use for pseudo-randomness. + """ cdef intp_t i, j # XXX: should this be `i` or `i+1`? for valid Fisher-Yates? 
@@ -254,7 +266,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): # construct an array to sample from mTry x n_features set of indices cdef intp_t[::1] indices_to_sample = self.indices_to_sample - cdef intp_t grid_size = self.max_features * self.n_features + cdef intp_t grid_size = len(indices_to_sample) # shuffle indices over the 2D grid to sample using Fisher-Yates self.fisher_yates_shuffle_memview(indices_to_sample, grid_size, random_state) @@ -331,8 +343,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) - # with gil: - # print("Finished sampling projection matrix") + with gil: + print("Finished sampling projection matrix") # For every vector in the projection matrix for feat_i in range(max_features): @@ -408,6 +420,8 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Account for projection vector temp_d = 0.0 for j in range(best_split.proj_vec_indices.size()): + with gil: + print(self.X.shape, samples[p], j, deref(best_split.proj_vec_indices)[j]) temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ deref(best_split.proj_vec_weights)[j] @@ -691,12 +705,6 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # replaces usage of max_features self.max_features_per_set = max_features_per_set - def __getstate__(self): - return {} - - def __setstate__(self, d): - pass - def __reduce__(self): """Enable pickling the splitter.""" return (type(self), @@ -732,16 +740,24 @@ cdef class MultiViewSplitter(BestObliqueSplitter): cdef intp_t feature_set_begin = 0 cdef intp_t size_of_feature_set, size_of_sampling cdef intp_t ifeat = 0 + cdef intp_t iproj = 0 + + # the index to sample in the vectorized mtry x n_features grid + cdef intp_t index + for i_feature in range(self.n_feature_sets): # n_features * max_features_per_set size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin size_of_sampling = self.max_features_per_set[i_feature] * 
size_of_feature_set # push an index corresponding to each element we want to sample + # this pushes indices mtry_in_set * n_features_in_set for ifeat in range(size_of_sampling): - self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) - - print(i_feature, ifeat + feature_set_begin) + # index of the sampled feature in this feature set + feature set offset + projection offset + index = ifeat + feature_set_begin + (iproj * self.n_features) + self.multi_indices_to_sample[i_feature].push_back(index) + print('Inside init: ', i_feature, index, size_of_sampling, size_of_feature_set) + iproj += 1 feature_set_begin = self.feature_set_ends[i_feature] return 0 @@ -756,7 +772,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): but now also uniformly samples features from each feature set. """ cdef uint32_t* random_state = &self.rand_r_state - cdef intp_t feat_i, proj_i + cdef intp_t feat_i, proj_i, rand_vec_index cdef float32_t weight # keep track of the beginning and ending indices of each feature set @@ -776,23 +792,41 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # each feature-set equally while proj_i < self.max_features: # sample from a feature set + with gil: + print('Sampling projection: ', proj_i, self.n_samples, self.n_features, + self.max_features, self.n_feature_sets, + list(self.feature_set_ends[:]), + list(self.max_features_per_set[:])) for idx in range(self.n_feature_sets): # get the max-features for this feature-set max_features = self.max_features_per_set[idx] grid_size = self.multi_indices_to_sample[idx].size() + with gil: + print(self.multi_indices_to_sample[0].size()) + print(self.multi_indices_to_sample[1].size()) + + # for i in range(0, grid_size - 1): + # j = rand_int(i + 1, grid_size, random_state) + # swap[intp_t](self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j]) + # Note: a temporary variable must not be used, else a copy will be made - for i in range(0, self.multi_indices_to_sample[idx].size() 
- 1): + for i in range(0, grid_size - 1): j = rand_int(i + 1, grid_size, random_state) self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = \ self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] for ifeature in range(max_features): # sample random feature in this set - feat_i = self.multi_indices_to_sample[idx][ifeature] + rand_vec_index = self.multi_indices_to_sample[idx][ifeature] # here, axis-aligned splits are entirely weights of 1 - weight = 1 # if (rand_int(0, 2, random_state) == 1) else -1 + weight = 1 + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + proj_i = rand_vec_index // self.n_features + feat_i = rand_vec_index % self.n_features proj_mat_indices[proj_i].push_back(feat_i) # Store vectorized index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero @@ -800,7 +834,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # XXX: debug only if feat_i > self.n_features: with gil: - print(idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) + print('Sampling projection: ', idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) # break early if we've sampled enough features proj_i += 1 @@ -809,7 +843,6 @@ cdef class MultiViewSplitter(BestObliqueSplitter): if proj_i >= self.max_features: break - # TODO: need to check segfault for multiview oblique splitter # REBUILD WITH BOUNDS CHECK cdef class MultiViewObliqueSplitter(MultiViewSplitter): diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py index ad5bf143a..0d6bc4be3 100644 --- a/sktree/tree/tests/test_multiview.py +++ b/sktree/tree/tests/test_multiview.py @@ -27,6 +27,8 @@ def test_sklearn_compatible_estimator(estimator, check): check(estimator) + +@pytest.mark.skip() @pytest.mark.parametrize( "est", [MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier] ) diff --git 
a/test_mvoblique_tree.py b/test_mvoblique_tree.py index c72aaba1b..347bb8d0f 100644 --- a/test_mvoblique_tree.py +++ b/test_mvoblique_tree.py @@ -18,63 +18,38 @@ rng = np.random.default_rng(seed=seed) - -X_small = np.array( - [ - [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0], - [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1], - [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1], - [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1], - [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1], - [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1], - [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1], - [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], - [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1], - [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0], - [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0], - [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0], - [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0], - [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0], - [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1], - [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1], - [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1], - [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1], - [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1], - [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1], - [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1], - [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1], - [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0], - ] +X = np.random.random((20, 10)) +y = np.random.randint(0, 2, size=20) + +# test with max_features as a float +clf = MultiViewDecisionTreeClassifier( + random_state=seed, + feature_set_ends=[6, 10], + max_features=0.5, ) - -y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0] -y_small_reg = [ - 1.0, - 2.1, - 1.2, - 0.05, - 10, - 2.4, - 3.1, - 1.01, - 0.01, - 2.98, - 3.1, - 1.1, - 0.0, - 1.2, - 2, - 11, - 0, - 0, - 4.5, - 0.201, - 1.06, - 0.9, - 0, -] - -clf = MultiViewDecisionTreeClassifier(random_state=0) - 
-print(X_small.shape) -clf.fit(X_small, y_small) +clf.fit(X, y) + +assert_array_equal(clf.max_features_per_set_, [3, 2]) +assert clf.max_features_ == 5 + +# test with max_features as sqrt +# X = np.random.random((20, 13)) +# clf = MultiViewDecisionTreeClassifier( +# random_state=seed, +# feature_set_ends=[9, 13], +# max_features="sqrt", +# ) +# clf.fit(X, y) +# assert_array_equal(clf.max_features_per_set_, [3, 2]) +# assert clf.max_features_ == 5 + +# # test with max_features as 'sqrt' but not a perfect square +# X = np.random.random((20, 9)) +# clf = MultiViewDecisionTreeClassifier( +# random_state=seed, +# feature_set_ends=[5, 9], +# max_features="sqrt", +# ) +# clf.fit(X, y) +# assert_array_equal(clf.max_features_per_set_, [3, 2]) +# assert clf.max_features_ == 5 \ No newline at end of file From 04daabede43b2a9ed179fd609deb7ffef4d538a3 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Wed, 3 Jul 2024 17:15:29 -0400 Subject: [PATCH 09/26] WIP Signed-off-by: Adam Li --- sktree/tree/_multiview.py | 7 ++++++- sktree/tree/_oblique_splitter.pyx | 8 ++++---- sktree/tree/tests/test_multiview.py | 1 - test_mvoblique_tree.py | 2 +- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 3934d3e31..80b570014 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -441,7 +441,12 @@ def _build_tree( # the total number of features to sample per split self.max_features_ = np.sum(self.max_features_per_set_) - print(self.max_features_, self.max_features_per_set_, self.feature_set_ends_, self.n_features_in_set_) + print( + self.max_features_, + self.max_features_per_set_, + self.feature_set_ends_, + self.n_features_in_set_, + ) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( criterion, diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index aee8e2881..10c03e972 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ 
b/sktree/tree/_oblique_splitter.pyx @@ -7,8 +7,8 @@ import numpy as np from cython.operator cimport dereference as deref -from libcpp.vector cimport vector from libcpp.algorithm cimport swap +from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform @@ -756,7 +756,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # index of the sampled feature in this feature set + feature set offset + projection offset index = ifeat + feature_set_begin + (iproj * self.n_features) self.multi_indices_to_sample[i_feature].push_back(index) - print('Inside init: ', i_feature, index, size_of_sampling, size_of_feature_set) + print("Inside init: ", i_feature, index, size_of_sampling, size_of_feature_set) iproj += 1 feature_set_begin = self.feature_set_ends[i_feature] return 0 @@ -793,7 +793,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): while proj_i < self.max_features: # sample from a feature set with gil: - print('Sampling projection: ', proj_i, self.n_samples, self.n_features, + print("Sampling projection: ", proj_i, self.n_samples, self.n_features, self.max_features, self.n_feature_sets, list(self.feature_set_ends[:]), list(self.max_features_per_set[:])) @@ -834,7 +834,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # XXX: debug only if feat_i > self.n_features: with gil: - print('Sampling projection: ', idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) + print("Sampling projection: ", idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) # break early if we've sampled enough features proj_i += 1 diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py index 0d6bc4be3..508c02d14 100644 --- a/sktree/tree/tests/test_multiview.py +++ b/sktree/tree/tests/test_multiview.py @@ -27,7 +27,6 @@ def test_sklearn_compatible_estimator(estimator, check): check(estimator) - @pytest.mark.skip() @pytest.mark.parametrize( "est", 
[MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier] diff --git a/test_mvoblique_tree.py b/test_mvoblique_tree.py index 347bb8d0f..1fd1124ee 100644 --- a/test_mvoblique_tree.py +++ b/test_mvoblique_tree.py @@ -52,4 +52,4 @@ # ) # clf.fit(X, y) # assert_array_equal(clf.max_features_per_set_, [3, 2]) -# assert clf.max_features_ == 5 \ No newline at end of file +# assert clf.max_features_ == 5 From ac87b0725f40fe1f70b9b41c78591f5f06dff152 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 11:44:05 -0400 Subject: [PATCH 10/26] Working prototype for multiview oblique Signed-off-by: Adam Li --- .../plot_multiview_axis_aligned_splitter.py | 142 ++++++++- sktree/tree/_multiview.py | 66 ++++- sktree/tree/_oblique_splitter.pxd | 4 +- sktree/tree/_oblique_splitter.pyx | 276 ++++++++++++------ sktree/tree/tests/test_multiview.py | 8 +- test_mvoblique_tree.py | 55 ---- 6 files changed, 391 insertions(+), 160 deletions(-) delete mode 100644 test_mvoblique_tree.py diff --git a/examples/splitters/plot_multiview_axis_aligned_splitter.py b/examples/splitters/plot_multiview_axis_aligned_splitter.py index 00b8c0280..81ae5fae3 100644 --- a/examples/splitters/plot_multiview_axis_aligned_splitter.py +++ b/examples/splitters/plot_multiview_axis_aligned_splitter.py @@ -28,10 +28,10 @@ from matplotlib.colors import ListedColormap from sktree._lib.sklearn.tree._criterion import Gini -from sktree.tree._oblique_splitter import MultiViewSplitterTester +from sktree.tree._oblique_splitter import MultiViewObliqueSplitterTester, MultiViewSplitterTester criterion = Gini(1, np.array((0, 1))) -max_features = 5 +max_features = 6 min_samples_leaf = 1 min_weight_leaf = 0.0 random_state = np.random.RandomState(10) @@ -40,7 +40,7 @@ feature_set_ends = np.array([3, 5, 9], dtype=np.intp) n_feature_sets = len(feature_set_ends) -max_features_per_set_ = None +max_features_per_set_ = np.array([2, 2, 2]) feature_combinations = 1 monotonic_cst = None missing_value_feature_mask = None 
@@ -99,7 +99,11 @@ for iend in feature_set_ends[1:]: ax.axvline(iend - 0.5, color="black", linewidth=1) -ax.set(title="Sampled Projection Matrix", xlabel="Feature Index", ylabel="Projection Vector Index") +ax.set( + title="Sampled Projection Matrix: \nMultiview Axis Aligned Split with Equal Max_Features", + xlabel="Feature Index", + ylabel="Projection Vector Index", +) ax.set_xticks(np.arange(feature_set_ends[-1])) ax.set_yticks(np.arange(max_features)) ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) @@ -115,6 +119,7 @@ colorbar.set_label("Projection Weight (I.e. Sampled Feature From a Feature Set)") colorbar.ax.set_yticklabels(["0", "1"]) +fig.tight_layout() plt.show() # %% @@ -160,7 +165,11 @@ for iend in feature_set_ends[1:]: ax.axvline(iend - 0.5, color="black", linewidth=1) -ax.set(title="Sampled Projection Matrix", xlabel="Feature Index", ylabel="Projection Vector Index") +ax.set( + title="Sampled Projection Matrix:\n Multiview Axis-aligned Splitter", + xlabel="Feature Index", + ylabel="Projection Vector Index", +) ax.set_xticks(np.arange(feature_set_ends[-1])) ax.set_yticks(np.arange(max_features)) ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) @@ -176,6 +185,129 @@ colorbar.set_label("Projection Weight (I.e. Sampled Feature From a Feature Set)") colorbar.ax.set_yticklabels(["0", "1"]) +fig.tight_layout() +plt.show() + +# %% +# Sampling multiview oblique splits +# --------------------------------- +# The multi-view splitter can also sample oblique splits. The oblique splits are +# generated by sampling a projection matrix and then transforming the data into the +# projected space. 
+ +feature_combinations = 1.5 +cross_feature_set_sampling = False +splitter = MultiViewObliqueSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + feature_set_ends, + n_feature_sets, + max_features_per_set_, + cross_feature_set_sampling, +) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) + +# sample the projection matrix +projection_matrix = splitter.sample_projection_matrix_py() +print(projection_matrix) + +cmap = ListedColormap(["orange", "white", "green"]) + +# Create a heatmap to visualize the indices +fig, ax = plt.subplots(figsize=(6, 6)) + +ax.imshow( + projection_matrix, cmap=cmap, aspect=feature_set_ends[-1] / max_features, interpolation="none" +) +ax.axvline(feature_set_ends[0] - 0.5, color="black", linewidth=1, label="Feature Sets") +for iend in feature_set_ends[1:]: + ax.axvline(iend - 0.5, color="black", linewidth=1) + +ax.set( + title="Sampled Projection Matrix:\n Multiview Oblique Splits W/O Cross-Feature Sampling", + xlabel="Feature Index", + ylabel="Projection Vector Index", +) +ax.set_xticks(np.arange(feature_set_ends[-1])) +ax.set_yticks(np.arange(max_features)) +ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) +ax.set_xticklabels(np.arange(feature_set_ends[-1], dtype=int) + 1) +ax.legend() + +# Create a mappable object +sm = ScalarMappable(cmap=cmap) +sm.set_array([]) # You can set an empty array or values here + +# Create a color bar with labels for each feature set +colorbar = fig.colorbar(sm, ax=ax, ticks=[0, 0.5, 1], format="%d") +colorbar.set_label("Projection Weight") +colorbar.ax.set_yticklabels(["-1", "0", "1"]) + +fig.tight_layout() +plt.show() + +# %% +# Sampling multiview oblique splits with cross-feature-set sampling. +# Now, we can also sample across feature sets within each projection vector. 
+ +cross_feature_set_sampling = True +splitter = MultiViewObliqueSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + monotonic_cst, + feature_combinations, + feature_set_ends, + n_feature_sets, + max_features_per_set_, + cross_feature_set_sampling, +) +splitter.init_test(X, y, sample_weight, missing_value_feature_mask) + +# sample the projection matrix +projection_matrix = splitter.sample_projection_matrix_py() +print(projection_matrix) + +cmap = ListedColormap(["orange", "white", "green"]) + +# Create a heatmap to visualize the indices +fig, ax = plt.subplots(figsize=(6, 6)) + +ax.imshow( + projection_matrix, cmap=cmap, aspect=feature_set_ends[-1] / max_features, interpolation="none" +) +ax.axvline(feature_set_ends[0] - 0.5, color="black", linewidth=1, label="Feature Sets") +for iend in feature_set_ends[1:]: + ax.axvline(iend - 0.5, color="black", linewidth=1) + +ax.set( + title="Sampled Projection Matrix:\n Multiview Oblique Splits W/ Cross-Feature Sampling", + xlabel="Feature Index", + ylabel="Projection Vector Index", +) +ax.set_xticks(np.arange(feature_set_ends[-1])) +ax.set_yticks(np.arange(max_features)) +ax.set_yticklabels(np.arange(max_features, dtype=int) + 1) +ax.set_xticklabels(np.arange(feature_set_ends[-1], dtype=int) + 1) +ax.legend() + +# Create a mappable object +sm = ScalarMappable(cmap=cmap) +sm.set_array([]) # You can set an empty array or values here + +# Create a color bar with labels for each feature set +colorbar = fig.colorbar(sm, ax=ax, ticks=[0, 0.5, 1], format="%d") +colorbar.set_label("Projection Weight") +colorbar.ax.set_yticklabels(["-1", "0", "1"]) + +fig.tight_layout() plt.show() # %% diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 80b570014..e159bb621 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -441,12 +441,6 @@ def _build_tree( # the total number of features to sample per split self.max_features_ = 
np.sum(self.max_features_per_set_) - print( - self.max_features_, - self.max_features_per_set_, - self.feature_set_ends_, - self.n_features_in_set_, - ) if not isinstance(self.splitter, ObliqueSplitter): splitter = SPLITTERS[self.splitter]( criterion, @@ -851,6 +845,7 @@ class MultiViewObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassif "array-like", None, ] + _parameter_constraints["cross_feature_set_sampling"] = ["boolean"] def __init__( self, @@ -871,6 +866,7 @@ def __init__( monotonic_cst=None, feature_set_ends=None, feature_combinations=None, + cross_feature_set_sampling=False, ): super().__init__( criterion=criterion, @@ -891,6 +887,7 @@ def __init__( self.feature_set_ends = feature_set_ends self.feature_combinations = feature_combinations + self.cross_feature_set_sampling = cross_feature_set_sampling self._max_features_arr = None def _build_tree( @@ -1053,6 +1050,7 @@ def _build_tree( self.feature_set_ends_, self.n_feature_sets_, self.max_features_per_set_, + self.cross_feature_set_sampling, ) self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) @@ -1084,6 +1082,62 @@ def _build_tree( self.n_classes_ = self.n_classes_[0] self.classes_ = self.classes_[0] + def _fit( + self, + X, + y, + sample_weight=None, + check_input=True, + missing_values_in_feature_mask=None, + classes=None, + ): + # XXX: BaseDecisionTree does a check that requires max_features to not be a list/array-like + # so we need to temporarily set it to an acceptable value + # in the meantime, we will reset: + # - self.max_features_ to the original value + # - self.max_features_arr contains a possible array-like setting of max_features + self._max_features_arr = self.max_features + self.max_features = None + super()._fit(X, y, sample_weight, check_input, missing_values_in_feature_mask, classes) + self.max_features = self._max_features_arr + return self + + def fit(self, X, y, sample_weight=None, check_input=True, classes=None): + """Build a decision tree 
classifier from the training set (X, y). + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + + check_input : bool, default=True + Allow to bypass several input checking. + Don't use this parameter unless you know what you're doing. + + classes : array-like of shape (n_classes,), default=None + List of all the classes that can possibly appear in the y vector. + + Returns + ------- + self : MultiViewDecisionTreeClassifier + Fitted estimator. + """ + return self._fit( + X, y, sample_weight=sample_weight, check_input=check_input, classes=classes + ) + @property def _inheritable_fitted_attribute(self): """Define additional attributes to pass onto a parent meta tree-estimator. diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index 9f6df2d3d..aea477043 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -165,7 +165,9 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # XXX: This splitter is experimental. Expect changes frequently. 
cdef class MultiViewObliqueSplitter(MultiViewSplitter): - cdef const intp_t[:] n_non_zeros_per_set # the number of non-zero features in each feature set + # cdef const intp_t[:] n_non_zeros_per_set # the number of non-zero features in each feature set + cdef intp_t _max_feature_combinations # Number of non-zero features to sample per projection matrix + cdef bint cross_feature_set_sampling # Whether we sample across feature set when creating a projection vector cdef void sample_proj_mat( self, diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 10c03e972..d52d760c1 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -7,6 +7,7 @@ import numpy as np from cython.operator cimport dereference as deref +from libc.math cimport ceil from libcpp.algorithm cimport swap from libcpp.vector cimport vector @@ -134,7 +135,7 @@ cdef class BaseObliqueSplitter(Splitter): uint32_t* random_state, ) noexcept nogil: """Fisher-Yates shuffle for a 1D memoryview of indices. 
- + Parameters ---------- indices_to_sample : memoryview of intp_t @@ -261,8 +262,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef intp_t n_non_zeros = self.n_non_zeros cdef uint32_t* random_state = &self.rand_r_state - cdef intp_t i, feat_i, proj_i, rand_vec_index - cdef float32_t weight + cdef intp_t i, rand_vec_index # construct an array to sample from mTry x n_features set of indices cdef intp_t[::1] indices_to_sample = self.indices_to_sample @@ -288,6 +288,7 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + cdef class BestObliqueSplitter(ObliqueSplitter): def __reduce__(self): """Enable pickling the splitter.""" @@ -343,9 +344,6 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Sample the projection matrix self.sample_proj_mat(self.proj_mat_weights, self.proj_mat_indices) - with gil: - print("Finished sampling projection matrix") - # For every vector in the projection matrix for feat_i in range(max_features): # Projection vector has no nonzeros @@ -420,8 +418,6 @@ cdef class BestObliqueSplitter(ObliqueSplitter): # Account for projection vector temp_d = 0.0 for j in range(best_split.proj_vec_indices.size()): - with gil: - print(self.X.shape, samples[p], j, deref(best_split.proj_vec_indices)[j]) temp_d += self.X[samples[p], deref(best_split.proj_vec_indices)[j]] *\ deref(best_split.proj_vec_weights)[j] @@ -738,27 +734,25 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # create a helper array for allowing efficient Fisher-Yates cdef intp_t i_feature = 0 cdef intp_t feature_set_begin = 0 - cdef intp_t size_of_feature_set, size_of_sampling + cdef intp_t size_of_feature_set cdef intp_t ifeat = 0 - cdef intp_t iproj = 0 - - # the index to sample in the vectorized mtry x n_features grid - cdef intp_t index - + + # Here, we sample the indices of the features to sample in each feature set + # as a separate 
vector. This is done to allow for efficient Fisher-Yates + # shuffling of the indices, such that we randomly sample features to consider, but within + # each feature set separately. This ensures that the sampled projection matrix consists of + # a balanced number of features from each feature set. + # + # Example: + # multi_indices_to_sample[0] = [0, 1, 2, 3] + # multi_indices_to_sample[1] = [4, 5] + # which corresponds to a feature set with 4 features and another with 2 features. for i_feature in range(self.n_feature_sets): - # n_features * max_features_per_set size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - size_of_sampling = self.max_features_per_set[i_feature] * size_of_feature_set - - # push an index corresponding to each element we want to sample - # this pushes indices mtry_in_set * n_features_in_set - for ifeat in range(size_of_sampling): - # index of the sampled feature in this feature set + feature set offset + projection offset - index = ifeat + feature_set_begin + (iproj * self.n_features) - self.multi_indices_to_sample[i_feature].push_back(index) - print("Inside init: ", i_feature, index, size_of_sampling, size_of_feature_set) - iproj += 1 + for ifeat in range(size_of_feature_set): + self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) feature_set_begin = self.feature_set_ends[i_feature] + return 0 cdef void sample_proj_mat( @@ -772,7 +766,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): but now also uniformly samples features from each feature set. 
""" cdef uint32_t* random_state = &self.rand_r_state - cdef intp_t feat_i, proj_i, rand_vec_index + cdef intp_t feat_i, proj_i cdef float32_t weight # keep track of the beginning and ending indices of each feature set @@ -786,57 +780,33 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # of candidates, but if one feature set is exhausted, then that one is no longer sampled cdef intp_t i, j + # keep track of which mtry we are on proj_i = 0 # 02: Algorithm samples a different number features from each set, but considers # each feature-set equally while proj_i < self.max_features: # sample from a feature set - with gil: - print("Sampling projection: ", proj_i, self.n_samples, self.n_features, - self.max_features, self.n_feature_sets, - list(self.feature_set_ends[:]), - list(self.max_features_per_set[:])) for idx in range(self.n_feature_sets): # get the max-features for this feature-set max_features = self.max_features_per_set[idx] grid_size = self.multi_indices_to_sample[idx].size() - with gil: - print(self.multi_indices_to_sample[0].size()) - print(self.multi_indices_to_sample[1].size()) - - # for i in range(0, grid_size - 1): - # j = rand_int(i + 1, grid_size, random_state) - # swap[intp_t](self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j]) - # Note: a temporary variable must not be used, else a copy will be made for i in range(0, grid_size - 1): j = rand_int(i + 1, grid_size, random_state) - self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j] = \ - self.multi_indices_to_sample[idx][j], self.multi_indices_to_sample[idx][i] + swap[intp_t](self.multi_indices_to_sample[idx][i], self.multi_indices_to_sample[idx][j]) for ifeature in range(max_features): # sample random feature in this set - rand_vec_index = self.multi_indices_to_sample[idx][ifeature] + feat_i = self.multi_indices_to_sample[idx][ifeature] # here, axis-aligned splits are entirely weights of 1 - weight = 1 - - # get the projection index (i.e. 
row of the projection matrix) and - # feature index (i.e. column of the projection matrix) - proj_i = rand_vec_index // self.n_features - feat_i = rand_vec_index % self.n_features + weight = 1 # if (rand_int(0, 2, random_state) == 1) else -1 - proj_mat_indices[proj_i].push_back(feat_i) # Store vectorized index of nonzero + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero - # XXX: debug only - if feat_i > self.n_features: - with gil: - print("Sampling projection: ", idx, ifeature, proj_i, self.n_samples, self.n_features, feat_i) - - # break early if we've sampled enough features proj_i += 1 if proj_i >= self.max_features: break @@ -858,6 +828,7 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): const intp_t[:] feature_set_ends, intp_t n_feature_sets, const intp_t[:] max_features_per_set, + bint cross_feature_set_sampling, *argv ): self.feature_set_ends = feature_set_ends @@ -869,11 +840,46 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): self.max_features_per_set = max_features_per_set # compute # of non-zeros expected on average per feature set - cdef intp_t[:] n_non_zeros_per_set = np.zeros(self.n_feature_sets, dtype=np.intp) - cdef intp_t i - for i in range(self.n_feature_sets): - n_non_zeros_per_set[i] = (self.max_features_per_set[i] * self.feature_combinations) - self.n_non_zeros_per_set = n_non_zeros_per_set + # cdef intp_t[:] n_non_zeros_per_set = np.zeros(self.n_feature_sets, dtype=np.intp) + # cdef intp_t i + # for i in range(self.n_feature_sets): + # n_non_zeros_per_set[i] = (self.max_features_per_set[i] * self.feature_combinations) + # self.n_non_zeros_per_set = n_non_zeros_per_set + + self._max_feature_combinations = ceil(self.feature_combinations) + self.cross_feature_set_sampling = cross_feature_set_sampling + + cdef int init( + self, + object X, + const float64_t[:, ::1] y, + const float64_t[:] sample_weight, + const unsigned char[::1] 
missing_values_in_feature_mask, + ) except -1: + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + + self.X = X + + # create a helper array for allowing efficient Fisher-Yates + self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) + + # Here, we sample the indices of the features to sample in each feature set + # as a separate vector. This is done to allow for efficient Fisher-Yates + # shuffling of the indices, such that we randomly sample features to consider, but within + # each feature set separately. This ensures that the sampled projection matrix consists of + # a balanced number of features from each feature set. + # + # Example: + # multi_indices_to_sample[0] = [0, 1, 2, 3] + # multi_indices_to_sample[1] = [4, 5] + # which corresponds to a feature set with 4 features and another with 2 features. + # for i_feature in range(self.n_feature_sets): + # size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin + # for ifeat in range(size_of_feature_set): + # self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) + # feature_set_begin = self.feature_set_ends[i_feature] + + return 0 cdef void sample_proj_mat( self, @@ -888,44 +894,52 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): cdef intp_t n_features = self.n_features cdef uint32_t* random_state = &self.rand_r_state - cdef intp_t i, j, feat_i, proj_i, rand_vec_index - cdef float32_t weight - - # construct an array to sample from mTry x n_features set of indices - cdef vector[intp_t] indices_to_sample - cdef intp_t grid_size + cdef intp_t i, rand_vec_index # keep track of the beginning and ending indices of each feature set cdef intp_t idx - # 02: Algorithm samples feature combinations from each feature set uniformly and evaluates - # them independently. 
- # sample from a feature set using linear combinations among the two sets + # random number of non-zeros to sample per projection vector + cdef intp_t n_non_zeros + cdef intp_t rand_feature_set + cdef intp_t current_feature_set_end = 0 + cdef intp_t n_features_in_set, n_features_in_set_buff + + # keep track of which projection vector we are analyzing + cdef intp_t proj_i = 0 + + # XXX: Compared to the oblique splitter, the multi-view oblique splitter differs in how + # it considers combinations of features. In the oblique splitter, we sample out of a mtry x n_features + # matrix, an expected number of non-zeros throughout the whole matrix. In the multi-view oblique splitter, + # we sample per mtry a non-zero projection vector. In the oblique splitter, this means that + # not every projection vector is actually non-zero, but in the multi-view oblique splitter, every + # projection vector is non-zero. + # + # As of 07/05/24, we could still change this in the oblique splitter, so we don't have trivial + # projection vectors. + + # The algorithm for sampling a multi-view projection matrix proceeds as follows: + # 0. for each feature set, with a possibly different max_features: + # 1. Determine the number of non-zeros we want to sample `rand_uniform(0, math.ceil(self.feature_combinations))`. + # 2a. [Optional] If self.cross_feature_set_sampling, then while idx < n_non_zeros, sample a feature-set randomly + # 2b. sample a feature within feature-set randomly + # 2c. 
sample a weight randomly for idx in range(self.n_feature_sets): - # indices to sample is a 1D-index array of size (max_features * n_features_in_set) - # which is Fisher-Yates shuffled to sample random features in each feature set - indices_to_sample = self.multi_indices_to_sample[idx] - grid_size = indices_to_sample.size() - - # shuffle indices over the 2D grid for this feature set to sample using Fisher-Yates - for i in range(0, grid_size): - j = rand_int(0, grid_size, random_state) - indices_to_sample[j], indices_to_sample[i] = \ - indices_to_sample[i], indices_to_sample[j] - - # with gil: - # print(idx, "Finished fisher yates...") - # print(len(self.n_non_zeros_per_set), len(self.max_features_per_set), len(self.multi_indices_to_sample)) - # print(len(indices_to_sample), grid_size, self.n_non_zeros_per_set[idx]) - - # we want "n_non_zeros / K" for this feature set over K feature sets - for i in range(0, self.n_non_zeros_per_set[idx]): - # get the next index from the shuffled index array - rand_vec_index = indices_to_sample[i] + n_features_in_set = self.feature_set_ends[idx] - current_feature_set_end + + # 0. sample mtry projection vectors for this feature set + for jdx in range(self.max_features_per_set[idx]): + # 1. Determine the number of non-zeros we want to sample in this feature set's mtry + # We add 1 since the upper bound is exclusive + n_non_zeros = rand_int(0, self._max_feature_combinations + 1, random_state) + # sample a random feature in the current feature set + rand_vec_index = rand_int(0, n_features_in_set, random_state) + current_feature_set_end + + # push projection vector index and weight # get the projection index (i.e. row of the projection matrix) and # feature index (i.e. 
column of the projection matrix) - proj_i = rand_vec_index // n_features + # proj_i = rand_vec_index // n_features feat_i = rand_vec_index % n_features # sample a random weight @@ -934,6 +948,88 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + # sample 'n_non_zeros' in a mtry_per_feature_set X n_features projection matrix + for i in range(1, n_non_zeros): + if self.cross_feature_set_sampling: + # sample a feature set randomly if we allow cross-sampling + rand_feature_set = rand_int(0, self.n_feature_sets, random_state) + n_features_in_set_buff = self.feature_set_ends[rand_feature_set] + if rand_feature_set > 0: + n_features_in_set_buff -= self.feature_set_ends[rand_feature_set - 1] + else: + rand_feature_set = idx + n_features_in_set_buff = n_features_in_set + + # get another random feature in a possibly different feature set + rand_vec_index = rand_int(0, n_features_in_set_buff, random_state) + if rand_feature_set > 0: + rand_vec_index += self.feature_set_ends[rand_feature_set - 1] + + # get the projection index (i.e. row of the projection matrix) and + # feature index (i.e. column of the projection matrix) + # proj_i = rand_vec_index // n_features + feat_i = rand_vec_index % n_features + + # sample a random weight + weight = 1 if (rand_int(0, 2, random_state) == 1) else -1 + + proj_mat_indices[proj_i].push_back(feat_i) # Store index of nonzero + proj_mat_weights[proj_i].push_back(weight) # Store weight of nonzero + + # increment the projection vector we consider + proj_i += 1 + + # offset to sample features within the next feature set + current_feature_set_end = self.feature_set_ends[idx] + + +cdef class MultiViewObliqueSplitterTester(MultiViewObliqueSplitter): + """A class to expose a Python interface for testing.""" + + cpdef sample_projection_matrix_py(self): + """Sample projection matrix using a patch. 
+ + Used for testing purposes. + + Returns projection matrix of shape (max_features, n_features). + """ + cdef vector[vector[float32_t]] proj_mat_weights = vector[vector[float32_t]](self.max_features) + cdef vector[vector[intp_t]] proj_mat_indices = vector[vector[intp_t]](self.max_features) + cdef intp_t i, j + + # sample projection matrix in C/C++ + self.sample_proj_mat(proj_mat_weights, proj_mat_indices) + + # convert the projection matrix to something that can be used in Python + proj_vecs = np.zeros((self.max_features, self.n_features), dtype=np.float32) + for i in range(0, self.max_features): + for j in range(0, proj_mat_weights[i].size()): + weight = proj_mat_weights[i][j] + feat = proj_mat_indices[i][j] + + proj_vecs[i, feat] = weight + + return proj_vecs + + cpdef init_test(self, X, y, sample_weight, missing_values_in_feature_mask=None): + """Initializes the state of the splitter. + + Used for testing purposes. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + The input samples. + y : array-like, shape (n_samples,) + The target values (class labels in classification, real numbers in + regression). + sample_weight : array-like, shape (n_samples,) + Sample weights. + missing_values_in_feature_mask : array-like, shape (n_features,) + Whether or not a feature has missing values. 
+ """ + self.init(X, y, sample_weight, missing_values_in_feature_mask) + cdef class MultiViewSplitterTester(MultiViewSplitter): """A class to expose a Python interface for testing.""" diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py index 508c02d14..409455caa 100644 --- a/sktree/tree/tests/test_multiview.py +++ b/sktree/tree/tests/test_multiview.py @@ -27,11 +27,13 @@ def test_sklearn_compatible_estimator(estimator, check): check(estimator) -@pytest.mark.skip() @pytest.mark.parametrize( - "est", [MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier] + "est, baseline_est", + [ + (MultiViewDecisionTreeClassifier, DecisionTreeClassifier), + (MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier), + ], ) -@pytest.mark.parametrize("baseline_est", [MultiViewDecisionTreeClassifier, DecisionTreeClassifier]) def test_multiview_classification(baseline_est, est): """Test that explicit knowledge of multi-view structure improves classification accuracy. 
diff --git a/test_mvoblique_tree.py b/test_mvoblique_tree.py deleted file mode 100644 index 1fd1124ee..000000000 --- a/test_mvoblique_tree.py +++ /dev/null @@ -1,55 +0,0 @@ -import math - -import numpy as np -import pytest -from numpy.testing import assert_array_equal -from sklearn.datasets import make_blobs -from sklearn.metrics import accuracy_score -from sklearn.model_selection import cross_val_score -from sklearn.utils.estimator_checks import parametrize_with_checks - -from sktree.tree import ( - DecisionTreeClassifier, - MultiViewDecisionTreeClassifier, - MultiViewObliqueDecisionTreeClassifier, -) - -seed = 12345 - -rng = np.random.default_rng(seed=seed) - -X = np.random.random((20, 10)) -y = np.random.randint(0, 2, size=20) - -# test with max_features as a float -clf = MultiViewDecisionTreeClassifier( - random_state=seed, - feature_set_ends=[6, 10], - max_features=0.5, -) -clf.fit(X, y) - -assert_array_equal(clf.max_features_per_set_, [3, 2]) -assert clf.max_features_ == 5 - -# test with max_features as sqrt -# X = np.random.random((20, 13)) -# clf = MultiViewDecisionTreeClassifier( -# random_state=seed, -# feature_set_ends=[9, 13], -# max_features="sqrt", -# ) -# clf.fit(X, y) -# assert_array_equal(clf.max_features_per_set_, [3, 2]) -# assert clf.max_features_ == 5 - -# # test with max_features as 'sqrt' but not a perfect square -# X = np.random.random((20, 9)) -# clf = MultiViewDecisionTreeClassifier( -# random_state=seed, -# feature_set_ends=[5, 9], -# max_features="sqrt", -# ) -# clf.fit(X, y) -# assert_array_equal(clf.max_features_per_set_, [3, 2]) -# assert clf.max_features_ == 5 From 81493fecea0edd171c1871f1564e05e9a0c9562f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 11:45:28 -0400 Subject: [PATCH 11/26] Working prototype for multiview oblique Signed-off-by: Adam Li --- sktree/tree/_multiview.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index e159bb621..0faaf53d9 
100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -772,6 +772,14 @@ class MultiViewObliqueDecisionTreeClassifier(SimMatrixMixin, DecisionTreeClassif next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, then this will assume that there is only one feature set. + feature_combinations : float, default=None + The number of feature combinations to consider at each split. + If None, then this will default to the number of features in the + respective feature set. + + cross_feature_set_sampling : bool, default=False + Whether to sample features across feature sets during the oblique splits. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray From c6824b9d71728d26ed5a5dd91199e0ae5a4da4ab Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 12:24:29 -0400 Subject: [PATCH 12/26] Add mvrf Signed-off-by: Adam Li --- doc/whats_new/v0.9.rst | 6 +- sktree/__init__.py | 7 +- sktree/ensemble/__init__.py | 2 +- sktree/ensemble/_multiview.py | 293 +++++++++++++++++++++++++- sktree/tests/test_multiview_forest.py | 7 +- 5 files changed, 310 insertions(+), 5 deletions(-) diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index 696929b5e..6a7a4badf 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -17,7 +17,11 @@ Changelog ``apply_max_features_per_feature_set`` argument anymore. Instead, the ``max_features`` argument is used to control the number of features to consider when looking for the best split within each feature set explicitly. - By `Adam Li`_ :pr:`#247`. + By `Adam Li`_ :pr:`#265`. + +- |Feature| :class:`sktree.tree.MultiViewObliqueDecisionTreeClassifier` is implemented + along with its forest version :class:`sktree.ensemble.MultiViewObliqueRandomForestClassifier`. + By `Adam Li`_ :pr:`#265`. 
Code and Documentation Contributors ----------------------------------- diff --git a/sktree/__init__.py b/sktree/__init__.py index 58636a24c..07dede4d5 100644 --- a/sktree/__init__.py +++ b/sktree/__init__.py @@ -45,7 +45,11 @@ ExtraTreesRegressor, ) from .neighbors import NearestNeighborsMetaEstimator - from .ensemble import ExtendedIsolationForest, MultiViewRandomForestClassifier + from .ensemble import ( + ExtendedIsolationForest, + MultiViewRandomForestClassifier, + MultiViewObliqueRandomForestClassifier, + ) from .ensemble._unsupervised_forest import ( UnsupervisedRandomForest, UnsupervisedObliqueRandomForest, @@ -88,4 +92,5 @@ "ExtraTreesRegressor", "ExtendedIsolationForest", "MultiViewRandomForestClassifier", + "MultiViewObliqueRandomForestClassifier", ] diff --git a/sktree/ensemble/__init__.py b/sktree/ensemble/__init__.py index aa97d0215..15955dc5a 100644 --- a/sktree/ensemble/__init__.py +++ b/sktree/ensemble/__init__.py @@ -1,6 +1,6 @@ from ._eiforest import ExtendedIsolationForest from ._honest_forest import HonestForestClassifier -from ._multiview import MultiViewRandomForestClassifier +from ._multiview import MultiViewObliqueRandomForestClassifier, MultiViewRandomForestClassifier from ._supervised_forest import ( ExtraObliqueRandomForestClassifier, ExtraObliqueRandomForestRegressor, diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py index 828212335..a44767f0d 100644 --- a/sktree/ensemble/_multiview.py +++ b/sktree/ensemble/_multiview.py @@ -1,7 +1,7 @@ from sklearn.utils._param_validation import StrOptions from .._lib.sklearn.ensemble._forest import ForestClassifier -from ..tree import MultiViewDecisionTreeClassifier +from ..tree import MultiViewDecisionTreeClassifier, MultiViewObliqueDecisionTreeClassifier from ..tree._neighbors import SimMatrixMixin from ._extensions import ForestClassifierMixin, ForestMixin @@ -292,3 +292,294 @@ def __init__( self.min_weight_fraction_leaf = min_weight_fraction_leaf self.max_leaf_nodes = 
max_leaf_nodes self.min_impurity_decrease = min_impurity_decrease + + +class MultiViewObliqueRandomForestClassifier(MultiViewDecisionTreeClassifier): + """ + A multi-view axis-aligned random forest classifier. + + A multi-view random forest is a meta estimator similar to a random + forest that fits a number of multi-view decision tree classifiers + on various sub-samples of the dataset and uses averaging to + improve the predictive accuracy and control over-fitting. + + Parameters + ---------- + n_estimators : int, default=100 + The number of trees in the forest. + + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + Note: this parameter is tree-specific. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. 
+ + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : {"sqrt", "log2", None}, int or float, default="sqrt" + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `round(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + max_leaf_nodes : int, default=None + Grow trees with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + bootstrap : bool, default=True + Whether bootstrap samples are used when building trees. If False, the + whole dataset is used to build each tree. 
+ + oob_score : bool, default=False + Whether to use out-of-bag samples to estimate the generalization score. + Only available if bootstrap=True. + + n_jobs : int, default=None + The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`, + :meth:`decision_path` and :meth:`apply` are all parallelized over the + trees. ``None`` means 1 unless in a `joblib.parallel_backend` + context. ``-1`` means using all processors. See :term:`Glossary + <n_jobs>` for more details. + + random_state : int, RandomState instance or None, default=None + Controls both the randomness of the bootstrapping of the samples used + when building trees (if ``bootstrap=True``) and the sampling of the + features to consider when looking for the best split at each node + (if ``max_features < n_features``). + See :term:`Glossary <random_state>` for details. + + verbose : int, default=0 + Controls the verbosity when fitting and predicting. + + warm_start : bool, default=False + When set to ``True``, reuse the solution of the previous call to fit + and add more estimators to the ensemble, otherwise, just fit a whole + new forest. See :term:`the Glossary <warm_start>`. + + class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \ + default=None + Weights associated with classes in the form ``{class_label: weight}``. + If not given, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. 
+ + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + The "balanced_subsample" mode is the same as "balanced" except that + weights are computed based on the bootstrap sample for every tree + grown. + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + + max_samples : int or float, default=None + If bootstrap is True, the number of samples to draw from X + to train each base estimator. + + - If None (default), then draw `X.shape[0]` samples. + - If int, then draw `max_samples` samples. + - If float, then draw `max_samples * X.shape[0]` samples. Thus, + `max_samples` should be in the interval `(0.0, 1.0]`. + + feature_set_ends : array-like of int of shape (n_feature_sets,), default=None + The indices of the end of each feature set. For example, if the first + feature set is the first 10 features, and the second feature set is the + next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, + then this will assume that there is only one feature set. + + Attributes + ---------- + estimators_ : list of sktree.tree.ObliqueDecisionTreeClassifier + The collection of fitted sub-estimators. + + classes_ : ndarray of shape (n_classes,) or a list of such arrays + The classes labels (single output problem), or a list of arrays of + class labels (multi-output problem). + + n_classes_ : int or list + The number of classes (single output problem), or a list containing the + number of classes for each output (multi-output problem). + + n_features_ : int + The number of features when ``fit`` is performed. + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. 
Defined only when `X` + has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + oob_score_ : float + Score of the training dataset obtained using an out-of-bag estimate. + This attribute exists only when ``oob_score`` is True. + + oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \ + (n_samples, n_classes, n_outputs) + Decision function computed with out-of-bag estimate on the training + set. If n_estimators is small it might be possible that a data point + was never left out during the bootstrap. In this case, + `oob_decision_function_` might contain NaN. This attribute exists + only when ``oob_score`` is True. + + See Also + -------- + sktree.tree.ObliqueDecisionTreeClassifier : An oblique decision + tree classifier. + sklearn.ensemble.RandomForestClassifier : An axis-aligned decision + forest classifier. 
+ """ + + tree_type = "oblique" + _parameter_constraints: dict = { + **MultiViewObliqueDecisionTreeClassifier._parameter_constraints, + "class_weight": [ + StrOptions({"balanced_subsample", "balanced"}), + dict, + list, + None, + ], + } + _parameter_constraints.pop("splitter") + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_set_ends=None, + feature_combinations=None, + cross_feature_set_sampling=False, + ): + super().__init__( + estimator=MultiViewObliqueDecisionTreeClassifier(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "feature_set_ends", + "feature_combinations", + "cross_feature_set_sampling", + ), + bootstrap=bootstrap, + oob_score=oob_score, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + warm_start=warm_start, + class_weight=class_weight, + max_samples=max_samples, + ) + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.max_features = max_features + self.feature_set_ends = feature_set_ends + self.feature_combinations = feature_combinations + self.cross_feature_set_sampling = cross_feature_set_sampling + + # unused by oblique forests + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_leaf_nodes = max_leaf_nodes + self.min_impurity_decrease = min_impurity_decrease diff --git a/sktree/tests/test_multiview_forest.py b/sktree/tests/test_multiview_forest.py index da168bbba..d1a66a380 100644 
--- a/sktree/tests/test_multiview_forest.py +++ b/sktree/tests/test_multiview_forest.py @@ -6,7 +6,11 @@ from sklearn.model_selection import cross_val_score, train_test_split from sklearn.utils.estimator_checks import parametrize_with_checks -from sktree import MultiViewRandomForestClassifier, RandomForestClassifier +from sktree import ( + MultiViewObliqueRandomForestClassifier, + MultiViewRandomForestClassifier, + RandomForestClassifier, +) from sktree.datasets.multiview import make_joint_factor_model seed = 12345 @@ -15,6 +19,7 @@ @parametrize_with_checks( [ MultiViewRandomForestClassifier(random_state=12345, n_estimators=10), + MultiViewObliqueRandomForestClassifier(random_state=12345, n_estimators=10), ] ) def test_sklearn_compatible_estimator(estimator, check): From f546194db4477f2722b2a744e627bfe2324e53d0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 12:57:05 -0400 Subject: [PATCH 13/26] Enable multiview oblique rf tests Signed-off-by: Adam Li --- sktree/ensemble/_multiview.py | 10 +++++++++- sktree/tests/test_multiview_forest.py | 19 +++++++++++++++---- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py index a44767f0d..4f85ddbd3 100644 --- a/sktree/ensemble/_multiview.py +++ b/sktree/ensemble/_multiview.py @@ -294,7 +294,7 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease -class MultiViewObliqueRandomForestClassifier(MultiViewDecisionTreeClassifier): +class MultiViewObliqueRandomForestClassifier(MultiViewRandomForestClassifier): """ A multi-view axis-aligned random forest classifier. @@ -451,6 +451,14 @@ class MultiViewObliqueRandomForestClassifier(MultiViewDecisionTreeClassifier): next 20 features, then ``feature_set_ends = [10, 30]``. If ``None``, then this will assume that there is only one feature set. + feature_combinations : float, default=None + The number of feature combinations to consider at each split. 
+ If None, then this will default to the number of features in the + respective feature set. + + cross_feature_set_sampling : bool, default=False + Whether to sample features across feature sets during the oblique splits. + Attributes ---------- estimators_ : list of sktree.tree.ObliqueDecisionTreeClassifier diff --git a/sktree/tests/test_multiview_forest.py b/sktree/tests/test_multiview_forest.py index d1a66a380..1a2e1be1c 100644 --- a/sktree/tests/test_multiview_forest.py +++ b/sktree/tests/test_multiview_forest.py @@ -26,8 +26,18 @@ def test_sklearn_compatible_estimator(estimator, check): check(estimator) -@pytest.mark.parametrize("baseline_est", [RandomForestClassifier]) -def test_multiview_classification(baseline_est): +@pytest.mark.parametrize( + "mv_est, kwargs", + [ + (MultiViewRandomForestClassifier, dict()), + (MultiViewObliqueRandomForestClassifier, dict(feature_combinations=2)), + ( + MultiViewObliqueRandomForestClassifier, + dict(feature_combinations=2, cross_feature_set_sampling=True), + ), + ], +) +def test_multiview_classification(mv_est, kwargs): """Test that explicit knowledge of multi-view structure improves classification accuracy. 
In very high-dimensional noise setting across two views, when the max_depth and max_features @@ -66,12 +76,13 @@ def test_multiview_classification(baseline_est): y = np.hstack((y0, y1)).T # Compare multiview decision tree vs single-view decision tree - clf = MultiViewRandomForestClassifier( + clf = mv_est( random_state=seed, feature_set_ends=[n_features_1, X.shape[1]], max_features="sqrt", max_depth=4, n_estimators=n_estimators, + **kwargs, ) clf.fit(X, y) assert ( @@ -81,7 +92,7 @@ def test_multiview_classification(baseline_est): cross_val_score(clf, X, y, cv=5).mean() == 1.0 ), f"CV score: {cross_val_score(clf, X, y, cv=5).mean()}" - clf = baseline_est( + clf = RandomForestClassifier( random_state=seed, max_depth=4, max_features="sqrt", From a1c6313a34e81e1d30a09ca161426936c9887a1c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 12:58:13 -0400 Subject: [PATCH 14/26] Add to api.rst Signed-off-by: Adam Li --- doc/api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/api.rst b/doc/api.rst index 0c112b384..fb3c24d13 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -66,6 +66,7 @@ how scikit-learn builds trees. PatchObliqueRandomForestRegressor HonestForestClassifier MultiViewRandomForestClassifier + MultiViewObliqueRandomForestClassifier .. currentmodule:: sktree.tree .. autosummary:: @@ -77,6 +78,7 @@ how scikit-learn builds trees. 
PatchObliqueDecisionTreeRegressor HonestTreeClassifier MultiViewDecisionTreeClassifier + MultiViewObliqueDecisionTreeClassifier Unsupervised ------------ From 48997cfaabcc9d5d0a57f29e89cf089f72cd74fe Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 13:51:43 -0400 Subject: [PATCH 15/26] Add to api.rst Signed-off-by: Adam Li --- sktree/ensemble/_multiview.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sktree/ensemble/_multiview.py b/sktree/ensemble/_multiview.py index 4f85ddbd3..9fbdd6d88 100644 --- a/sktree/ensemble/_multiview.py +++ b/sktree/ensemble/_multiview.py @@ -294,7 +294,9 @@ def __init__( self.min_impurity_decrease = min_impurity_decrease -class MultiViewObliqueRandomForestClassifier(MultiViewRandomForestClassifier): +class MultiViewObliqueRandomForestClassifier( + SimMatrixMixin, ForestClassifierMixin, ForestMixin, ForestClassifier +): """ A multi-view axis-aligned random forest classifier. From ef1dc6bf705e579737906d144255f5a807183f80 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 14:33:07 -0400 Subject: [PATCH 16/26] Fix unit tests Signed-off-by: Adam Li --- sktree/tree/_multiview.py | 2 +- sktree/tree/_oblique_splitter.pyx | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index 0faaf53d9..b0becd559 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -493,7 +493,7 @@ def _update_tree(self, X, y, sample_weight): # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - monotonic_cst = self.monotonic_cst_ + monotonic_cst = None # Build tree # Note: this reconstructs the builder with the same state it had during the diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index d52d760c1..7982c1de9 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -849,6 +849,23 @@ cdef 
class MultiViewObliqueSplitter(MultiViewSplitter): self._max_feature_combinations = ceil(self.feature_combinations) self.cross_feature_set_sampling = cross_feature_set_sampling + def __reduce__(self): + """Enable pickling the splitter.""" + return (type(self), + ( + self.criterion, + self.max_features, + self.min_samples_leaf, + self.min_weight_leaf, + self.random_state, + self.monotonic_cst.base if self.monotonic_cst is not None else None, + self.feature_combinations, + self.feature_set_ends.base if self.feature_set_ends is not None else None, + self.n_feature_sets, + self.max_features_per_set.base if self.max_features_per_set is not None else None, + self.cross_feature_set_sampling, + ), self.__getstate__()) + cdef int init( self, object X, From 9dbac9b467ea58e3c56f412ab8c0af75de14862f Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 16:13:54 -0400 Subject: [PATCH 17/26] Fix unit tests Signed-off-by: Adam Li --- sktree/stats/tests/test_forestht.py | 3 -- sktree/tree/_multiview.py | 79 +++++++++++++++++++++++++++-- sktree/tree/tests/test_multiview.py | 4 +- 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index 0e08a3e5b..e2dbe20ed 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -83,7 +83,6 @@ def test_small_dataset_independent(seed): stratify=True, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=feature_set_ends, - apply_max_features_per_feature_set=True, ), ) perm_clf = PermutationHonestForestClassifier( @@ -97,7 +96,6 @@ def test_small_dataset_independent(seed): stratify=True, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=feature_set_ends, - apply_max_features_per_feature_set=True, ), ) result = build_coleman_forest( @@ -208,7 +206,6 @@ def test_comight_repeated_feature_sets(seed): stratify=True, tree_estimator=MultiViewDecisionTreeClassifier( feature_set_ends=feature_set_ends, - 
apply_max_features_per_feature_set=True, ), ) diff --git a/sktree/tree/_multiview.py b/sktree/tree/_multiview.py index b0becd559..f4d4c5207 100644 --- a/sktree/tree/_multiview.py +++ b/sktree/tree/_multiview.py @@ -493,8 +493,6 @@ def _update_tree(self, X, y, sample_weight): # set decision-tree model parameters max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth - monotonic_cst = None - # Build tree # Note: this reconstructs the builder with the same state it had during the # initial fit. This is necessary because the builder is not saved as part @@ -524,7 +522,7 @@ def _update_tree(self, X, y, sample_weight): min_samples_leaf, min_weight_leaf, random_state, - monotonic_cst, + self.monotonic_cst_, self._feature_combinations_, self.feature_set_ends_, self.n_feature_sets_, @@ -951,6 +949,7 @@ def _build_tree( Controls the randomness of the estimator. """ monotonic_cst = None + self.monotonic_cst_ = monotonic_cst _, n_features = X.shape self.feature_combinations_ = ( @@ -1160,3 +1159,77 @@ def _inheritable_fitted_attribute(self): "max_features_per_set_", "feature_combinations_", ] + + def _update_tree(self, X, y, sample_weight): + # Update tree + max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes + min_samples_split = self.min_samples_split_ + min_samples_leaf = self.min_samples_leaf_ + min_weight_leaf = self.min_weight_leaf_ + # set decision-tree model parameters + max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth + + # Build tree + # Note: this reconstructs the builder with the same state it had during the + # initial fit. This is necessary because the builder is not saved as part + # of the class, and thus the state may be lost if pickled/unpickled. 
+ criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self._n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + random_state = check_random_state(self.random_state) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." + ) + else: + SPLITTERS = OBLIQUE_DENSE_SPLITTERS + if not isinstance(self.splitter, ObliqueSplitter): + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + self.monotonic_cst_, + self.feature_combinations_, + self.feature_set_ends_, + self.n_feature_sets_, + self.max_features_per_set_, + self.cross_feature_set_sampling, + ) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + self.store_leaf_values, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + self.store_leaf_values, + ) + builder.initialize_node_queue(self.tree_, X, y, sample_weight) + builder.build(self.tree_, X, y, sample_weight) + + self._prune_tree() + return self diff --git a/sktree/tree/tests/test_multiview.py b/sktree/tree/tests/test_multiview.py index 409455caa..18a0e0a3e 100644 --- a/sktree/tree/tests/test_multiview.py +++ b/sktree/tree/tests/test_multiview.py @@ -182,7 +182,7 @@ def test_multiview_separate_feature_set_sampling_is_consistent(): X = rng.standard_normal(size=(20, 10)) y = rng.integers(0, 2, size=20) - # test with max_features as an 
array but apply_max_features is off + # test with max_features as an array clf = MultiViewDecisionTreeClassifier( random_state=seed, feature_set_ends=[1, 3, 6, 10], @@ -195,7 +195,7 @@ def test_multiview_separate_feature_set_sampling_is_consistent(): assert_array_equal(clf.max_features_per_set_, [1, 2, 2, 3]) assert clf.max_features_ == np.sum(clf.max_features_per_set_), np.sum(clf.max_features_per_set_) - # test with max_features as an array but apply_max_features is off + # multiview feature set should be consistent across tres other_clf = MultiViewDecisionTreeClassifier( random_state=seed, feature_set_ends=[1, 3, 6, 10], From 7aeb6a61cfa2c8bd91360c145e8904d41712ec68 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 16:14:44 -0400 Subject: [PATCH 18/26] Fix unit tests Signed-off-by: Adam Li --- benchmarks_nonasv/bench_forestht.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/benchmarks_nonasv/bench_forestht.py b/benchmarks_nonasv/bench_forestht.py index 2bf0e6926..59e4dff9b 100644 --- a/benchmarks_nonasv/bench_forestht.py +++ b/benchmarks_nonasv/bench_forestht.py @@ -13,8 +13,6 @@ import seaborn as sns from scipy.special import expit -# using an outdated API, but the code could get refactored to use our new API -# build_coleman_forest, build_oob_forest, etc. 
from sktree.stats import PermutationForestClassifier, PermutationForestRegressor seed = 12345 From 64edda39e7675626d3cbd871adc98a8086de1c06 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 16:19:11 -0400 Subject: [PATCH 19/26] Remove runtime checks in cython Signed-off-by: Adam Li --- sktree/tree/_oblique_splitter.pxd | 1 - sktree/tree/_oblique_splitter.pyx | 41 +++++++------------------------ 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/sktree/tree/_oblique_splitter.pxd b/sktree/tree/_oblique_splitter.pxd index aea477043..6ad086d0f 100644 --- a/sktree/tree/_oblique_splitter.pxd +++ b/sktree/tree/_oblique_splitter.pxd @@ -165,7 +165,6 @@ cdef class MultiViewSplitter(BestObliqueSplitter): # XXX: This splitter is experimental. Expect changes frequently. cdef class MultiViewObliqueSplitter(MultiViewSplitter): - # cdef const intp_t[:] n_non_zeros_per_set # the number of non-zero features in each feature set cdef intp_t _max_feature_combinations # Number of non-zero features to sample per projection matrix cdef bint cross_feature_set_sampling # Whether we sample across feature set when creating a projection vector diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index 7982c1de9..f21b7e1f5 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -1,8 +1,8 @@ # distutils: language=c++ # cython: language_level=3 -# cython: boundscheck=True -# cython: wraparound=True -# cython: initializedcheck=True +# cython: boundscheck=False +# cython: wraparound=False +# cython: initializedcheck=False import numpy as np @@ -813,8 +813,7 @@ cdef class MultiViewSplitter(BestObliqueSplitter): if proj_i >= self.max_features: break -# TODO: need to check segfault for multiview oblique splitter -# REBUILD WITH BOUNDS CHECK + cdef class MultiViewObliqueSplitter(MultiViewSplitter): def __cinit__( self, @@ -839,14 +838,12 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): # replaces usage of 
max_features self.max_features_per_set = max_features_per_set - # compute # of non-zeros expected on average per feature set - # cdef intp_t[:] n_non_zeros_per_set = np.zeros(self.n_feature_sets, dtype=np.intp) - # cdef intp_t i - # for i in range(self.n_feature_sets): - # n_non_zeros_per_set[i] = (self.max_features_per_set[i] * self.feature_combinations) - # self.n_non_zeros_per_set = n_non_zeros_per_set - + # each projection vector (i.e. mtry) of each feature set will sample a feature combination of + # 1 to "max feature combinations" number of features. self._max_feature_combinations = ceil(self.feature_combinations) + + # with cross-feature-set sampling, the projection vector can combine different + # feature sets self.cross_feature_set_sampling = cross_feature_set_sampling def __reduce__(self): @@ -876,26 +873,6 @@ cdef class MultiViewObliqueSplitter(MultiViewSplitter): Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) self.X = X - - # create a helper array for allowing efficient Fisher-Yates - self.multi_indices_to_sample = vector[vector[intp_t]](self.n_feature_sets) - - # Here, we sample the indices of the features to sample in each feature set - # as a separate vector. This is done to allow for efficient Fisher-Yates - # shuffling of the indices, such that we randomly sample features to consider, but within - # each feature set separately. This ensures that the sampled projection matrix consists of - # a balanced number of features from each feature set. - # - # Example: - # multi_indices_to_sample[0] = [0, 1, 2, 3] - # multi_indices_to_sample[1] = [4, 5] - # which corresponds to a feature set with 4 features and another with 2 features. 
- # for i_feature in range(self.n_feature_sets): - # size_of_feature_set = self.feature_set_ends[i_feature] - feature_set_begin - # for ifeat in range(size_of_feature_set): - # self.multi_indices_to_sample[i_feature].push_back(ifeat + feature_set_begin) - # feature_set_begin = self.feature_set_ends[i_feature] - return 0 cdef void sample_proj_mat( From 65b0f305476ab275cd9b56b7443ad61b69a691b6 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Fri, 5 Jul 2024 16:44:00 -0400 Subject: [PATCH 20/26] Fix docs Signed-off-by: Adam Li --- doc/whats_new/v0.9.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v0.9.rst b/doc/whats_new/v0.9.rst index 6a7a4badf..41a920383 100644 --- a/doc/whats_new/v0.9.rst +++ b/doc/whats_new/v0.9.rst @@ -20,7 +20,7 @@ Changelog By `Adam Li`_ :pr:`#265`. - |Feature| :class:`sktree.tree.MultiViewObliqueDecisionTreeClassifier` is implemented - along with its forest version :class:`sktree.ensemble.MultiViewObliqueRandomForestClassifier`. + along with its forest version :class:`sktree.MultiViewObliqueRandomForestClassifier`. By `Adam Li`_ :pr:`#265`. 
Code and Documentation Contributors From aafeb69af9add5e132a65b2c372a97dc06c5493c Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 10:54:21 -0400 Subject: [PATCH 21/26] Removing Signed-off-by: Adam Li --- .gitignore | 1 + treeple/_lib/sklearn_fork | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 160000 treeple/_lib/sklearn_fork diff --git a/.gitignore b/.gitignore index f48bdcdb2..63dc5abe0 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ coverage commit.txt treeple/_lib/sklearn/ +treeple/_lib/sklearn_fork/ *.png _data diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork deleted file mode 160000 index d455aa16e..000000000 --- a/treeple/_lib/sklearn_fork +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d455aa16ee9cc42ce342dd07d9b94db117783fcc From a003d5a23ab6fda325ca9f37e5c4c54e1a55fe01 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 10:56:56 -0400 Subject: [PATCH 22/26] Fix Signed-off-by: Adam Li --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 63dc5abe0..f48bdcdb2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,6 @@ coverage commit.txt treeple/_lib/sklearn/ -treeple/_lib/sklearn_fork/ *.png _data From c1f9257dde1e0098f0b0f89a77d4346c5d5edbbe Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 10:59:15 -0400 Subject: [PATCH 23/26] New submodule Signed-off-by: Adam Li --- treeple/_lib/sklearn_fork | 1 + 1 file changed, 1 insertion(+) create mode 160000 treeple/_lib/sklearn_fork diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork new file mode 160000 index 000000000..d455aa16e --- /dev/null +++ b/treeple/_lib/sklearn_fork @@ -0,0 +1 @@ +Subproject commit d455aa16ee9cc42ce342dd07d9b94db117783fcc From b0e0dffc2f77c4e52f319d07d68f8821a932ea2a Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 11:32:44 -0400 Subject: [PATCH 24/26] Fix import Signed-off-by: Adam Li --- treeple/ensemble/_multiview.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/treeple/ensemble/_multiview.py b/treeple/ensemble/_multiview.py index afc45ae86..aabfc2324 100644 --- a/treeple/ensemble/_multiview.py +++ b/treeple/ensemble/_multiview.py @@ -463,7 +463,7 @@ class MultiViewObliqueRandomForestClassifier( Attributes ---------- - estimators_ : list of sktree.tree.ObliqueDecisionTreeClassifier + estimators_ : list of treeple.tree.ObliqueDecisionTreeClassifier The collection of fitted sub-estimators. classes_ : ndarray of shape (n_classes,) or a list of such arrays @@ -512,7 +512,7 @@ class labels (multi-output problem). See Also -------- - sktree.tree.ObliqueDecisionTreeClassifier : An oblique decision + treeple.tree.ObliqueDecisionTreeClassifier : An oblique decision tree classifier. sklearn.ensemble.RandomForestClassifier : An axis-aligned decision forest classifier. From 392a729dddfbedc2699f5f9a472b1c0662c77ae1 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 11:35:19 -0400 Subject: [PATCH 25/26] Fix import Signed-off-by: Adam Li --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index faf27c35d..20c8c81b1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,5 +83,5 @@ repos: - tomli files: ^(?!doc/use\.rst$).*\.(rst|inc)$ -ci: - autofix_prs: true +# ci: +# autofix_prs: true From 347dddb3596c22999fdfd5c1a4baac9a2d6368a0 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 9 Jul 2024 11:35:36 -0400 Subject: [PATCH 26/26] Fix import Signed-off-by: Adam Li --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 20c8c81b1..faf27c35d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -83,5 +83,5 @@ repos: - tomli files: ^(?!doc/use\.rst$).*\.(rst|inc)$ -# ci: -# autofix_prs: true +ci: + autofix_prs: true