Skip to content

Commit

Permalink
MAINT Fix builder partial fit (#62)
Browse files Browse the repository at this point in the history
#### Reference Issues/PRs
Removes the stored `builder_` state so it no longer needs to be maintained between fits; the builder is now reconstructed on each call instead.

Probably needs unit tests to confirm this functions as desired, i.e.:
- changing the datatype of X across multiple partial fits should fail gracefully,
- changing the datatype of y across multiple partial fits should fail gracefully,
- both classification and regression cases should be covered.

#### What does this implement/fix? Explain your changes.


#### Any other comments?


<!--
Please be aware that we are a loose team of volunteers so patience is
necessary; assistance handling other issues is very welcome. We value
all user contributions, no matter how minor they are. If we are slow to
review, either the pull request needs some benchmarking, tinkering,
convincing, etc. or more likely the reviewers are simply busy. In either
case, we ask for your understanding during the review process.
For more information, see our FAQ on this topic:

https://scikit-learn.org/dev/faq.html#why-is-my-pull-request-not-getting-any-attention.

Thanks for contributing!
-->

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
  • Loading branch information
adam2392 authored Apr 2, 2024
1 parent f360749 commit e020253
Showing 1 changed file with 105 additions and 20 deletions.
125 changes: 105 additions & 20 deletions sklearn/tree/_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ def _fit(
)

self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)

self._n_classes_ = self.n_classes_
if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
y = np.ascontiguousarray(y, dtype=DOUBLE)

Expand Down Expand Up @@ -377,6 +377,7 @@ def _fit(
min_samples_split = max(2, min_samples_split)
min_samples_split = max(min_samples_split, 2 * min_samples_leaf)
self.min_samples_split_ = min_samples_split
self.min_samples_leaf_ = min_samples_leaf

if isinstance(self.max_features, str):
if self.max_features == "sqrt":
Expand Down Expand Up @@ -411,6 +412,7 @@ def _fit(
min_weight_leaf = self.min_weight_fraction_leaf * n_samples
else:
min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)
self.min_weight_leaf_ = min_weight_leaf

# build the actual tree now with the parameters
self = self._build_tree(
Expand Down Expand Up @@ -521,6 +523,7 @@ def _build_tree(
# Since self.monotonic_cst encodes constraints on probabilities of the
# *positive class*, all signs must be flipped.
monotonic_cst *= -1
self.monotonic_cst_ = monotonic_cst

if not isinstance(self.splitter, BaseSplitter):
splitter = SPLITTERS[self.splitter](
Expand All @@ -544,7 +547,7 @@ def _build_tree(

# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
if max_leaf_nodes < 0:
self.builder_ = DepthFirstTreeBuilder(
builder = DepthFirstTreeBuilder(
splitter,
min_samples_split,
min_samples_leaf,
Expand All @@ -554,7 +557,7 @@ def _build_tree(
self.store_leaf_values,
)
else:
self.builder_ = BestFirstTreeBuilder(
builder = BestFirstTreeBuilder(
splitter,
min_samples_split,
min_samples_leaf,
Expand All @@ -564,9 +567,7 @@ def _build_tree(
self.min_impurity_decrease,
self.store_leaf_values,
)
self.builder_.build(
self.tree_, X, y, sample_weight, missing_values_in_feature_mask
)
builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)

if self.n_outputs_ == 1 and is_classifier(self):
self.n_classes_ = self.n_classes_[0]
Expand Down Expand Up @@ -1128,12 +1129,18 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
:ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
for basic usage of these attributes.
builder_ : TreeBuilder instance
The underlying TreeBuilder object.
min_samples_split_ : float
The minimum number of samples needed to split a node in the tree building.
min_weight_leaf_ : float
The minimum number of weighted samples in a leaf.
min_samples_leaf_ : int
The minimum number of samples needed for a leaf node.
monotonic_cst_ : array-like of int of shape (n_features,)
The monotonicity constraints enforced on each feature.
See Also
--------
DecisionTreeRegressor : A decision tree regressor.
Expand Down Expand Up @@ -1369,8 +1376,68 @@ def partial_fit(self, X, y, sample_weight=None, check_input=True, classes=None):
y = np.ascontiguousarray(y, dtype=DOUBLE)

# Update tree
self.builder_.initialize_node_queue(self.tree_, X, y, sample_weight)
self.builder_.build(self.tree_, X, y, sample_weight)
max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes
min_samples_split = self.min_samples_split_
min_samples_leaf = self.min_samples_leaf_
min_weight_leaf = self.min_weight_leaf_
# set decision-tree model parameters
max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth

monotonic_cst = self.monotonic_cst_

# Build tree
# Note: this reconstructs the builder with the same state it had during the
# initial fit. This is necessary because the builder is not saved as part
# of the class, and thus the state may be lost if pickled/unpickled.
n_samples = X.shape[0]
criterion = self.criterion
if not isinstance(criterion, BaseCriterion):
if is_classifier(self):
criterion = CRITERIA_CLF[self.criterion](
self.n_outputs_, self._n_classes_
)
else:
criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)
else:
# Make a deepcopy in case the criterion has mutable attributes that
# might be shared and modified concurrently during parallel fitting
criterion = copy.deepcopy(criterion)

random_state = check_random_state(self.random_state)
SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS
splitter = SPLITTERS[self.splitter](
criterion,
self.max_features_,
min_samples_leaf,
min_weight_leaf,
random_state,
monotonic_cst,
)

# Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
if max_leaf_nodes < 0:
builder = DepthFirstTreeBuilder(
splitter,
min_samples_split,
min_samples_leaf,
min_weight_leaf,
max_depth,
self.min_impurity_decrease,
self.store_leaf_values,
)
else:
builder = BestFirstTreeBuilder(
splitter,
min_samples_split,
min_samples_leaf,
min_weight_leaf,
max_depth,
max_leaf_nodes,
self.min_impurity_decrease,
self.store_leaf_values,
)
builder.initialize_node_queue(self.tree_, X, y, sample_weight)
builder.build(self.tree_, X, y, sample_weight)

self._prune_tree()

Expand Down Expand Up @@ -1637,12 +1704,18 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
:ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
for basic usage of these attributes.
builder_ : TreeBuilder instance
The underlying TreeBuilder object.
min_samples_split_ : float
The minimum number of samples needed to split a node in the tree building.
min_weight_leaf_ : float
The minimum number of weighted samples in a leaf.
monotonic_cst_ : array-like of int of shape (n_features,)
The monotonicity constraints enforced on each feature.
min_samples_leaf_ : int
The minimum number of samples needed for a leaf node.
See Also
--------
DecisionTreeClassifier : A decision tree classifier.
Expand Down Expand Up @@ -2022,12 +2095,18 @@ class ExtraTreeClassifier(DecisionTreeClassifier):
:ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
for basic usage of these attributes.
builder_ : TreeBuilder instance
The underlying TreeBuilder object.
min_samples_split_ : float
The minimum number of samples needed to split a node in the tree building.
min_weight_leaf_ : float
The minimum number of weighted samples in a leaf.
monotonic_cst_ : array-like of int of shape (n_features,)
The monotonicity constraints enforced on each feature.
min_samples_leaf_ : int
The minimum number of samples needed for a leaf node.
See Also
--------
ExtraTreeRegressor : An extremely randomized tree regressor.
Expand Down Expand Up @@ -2290,12 +2369,18 @@ class ExtraTreeRegressor(DecisionTreeRegressor):
:ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
for basic usage of these attributes.
builder_ : TreeBuilder instance
The underlying TreeBuilder object.
min_samples_split_ : float
The minimum number of samples needed to split a node in the tree building.
min_weight_leaf_ : float
The minimum number of weighted samples in a leaf.
monotonic_cst_ : array-like of int of shape (n_features,)
The monotonicity constraints enforced on each feature.
min_samples_leaf_ : int
The minimum number of samples needed for a leaf node.
See Also
--------
ExtraTreeClassifier : An extremely randomized tree classifier.
Expand Down

0 comments on commit e020253

Please sign in to comment.