[FIX] Adding fix to allow any kind of tree for honesttree (#158)
* Adding fix to allow any kind of tree for honesttree

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Oct 27, 2023
1 parent a81c597 commit d6d0b16
Showing 3 changed files with 142 additions and 17 deletions.
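
For context, this is the usage pattern the commit enables: a plain scikit-learn decision tree passed as the tree_estimator of HonestTreeClassifier and fit directly, mirroring the new test added below. A minimal sketch; the import path for HonestTreeClassifier is assumed from the test module's location, everything else follows the diff.

import numpy as np
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier

from sktree.tree import HonestTreeClassifier  # assumed import path

# Toy two-class data, as in the new test below.
X = np.ones((20, 4))
X[10:] *= -1
y = [0] * 10 + [1] * 10

# Previously this raised "Instead of using sklearn.tree, use trees import from sktree.";
# with this fix the honest wrapper fits, albeit with a limited API
# (sktree-only parameters such as store_leaf_values are skipped).
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)
print(clf.predict(X[:2]))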
1 change: 1 addition & 0 deletions doc/whats_new/v0.3.rst
@@ -17,6 +17,7 @@ Changelog
- |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
- |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`)
- |Fix| Fixed usage of ``feature_importances_`` property in ``HonestForestClassifier``, by `Adam Li`_ (:pr:`156`)
- |Fix| Fixed ``HonestForestClassifier`` to allow decision-trees from sklearn, albeit with a limited API, by `Adam Li`_ (:pr:`158`)

Code and Documentation Contributors
-----------------------------------
43 changes: 30 additions & 13 deletions sktree/tree/_honest_tree.py
@@ -4,7 +4,6 @@
import numpy as np
from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone
from sklearn.ensemble._base import _set_random_states
from sklearn.tree._classes import BaseDecisionTree as skBaseDecisionTree
from sklearn.utils.multiclass import _check_partial_fit_first_call, check_classification_targets
from sklearn.utils.validation import check_is_fitted, check_X_y

@@ -555,9 +554,10 @@ def _fit(
store_leaf_values=self.store_leaf_values,
)
else:
# XXX: Remove this?
# we throw an error if the user is using trees from sklearn:main
if isinstance(self.tree_estimator, skBaseDecisionTree):
raise RuntimeError("Instead of using sklearn.tree, use trees import from sktree.")
# if isinstance(self.tree_estimator, skBaseDecisionTree):
# raise RuntimeError("Instead of using sklearn.tree, use trees import from sktree.")

# XXX: maybe error out if the tree_estimator is already fitted
self.estimator_ = clone(self.tree_estimator)
@@ -575,23 +575,40 @@ def _fit(
random_state=self.random_state,
min_impurity_decrease=self.min_impurity_decrease,
ccp_alpha=self.ccp_alpha,
monotonic_cst=self.monotonic_cst,
store_leaf_values=self.store_leaf_values,
)
)
try:
self.estimator_.set_params(**dict(monotonic_cst=self.monotonic_cst))
self.estimator_.set_params(
**dict(
store_leaf_values=self.store_leaf_values,
)
)
except Exception:
print("Using sklearn tree")

if self.random_state is not None:
_set_random_states(self.estimator_, self.random_state)

# Learn structure on subsample
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
classes=classes,
)
# XXX: this allows us to use BaseDecisionTree without partial_fit API
try:
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
classes=classes,
)
except Exception:
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
)
self._inherit_estimator_attributes()

# update the number of classes, unsplit
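
In short, the new code tries the sktree-specific keyword arguments first and falls back to the plain sklearn signature when they are rejected. A minimal standalone sketch of that pattern using only public sklearn API; the helper name set_param_if_supported is illustrative, not part of sktree.

from sklearn.tree import DecisionTreeClassifier


def set_param_if_supported(estimator, **params):
    # Same idea as the try/except above: attempt to set sktree-only
    # parameters and fall back silently when the estimator rejects them.
    try:
        estimator.set_params(**params)
    except ValueError:
        # Plain sklearn trees raise ValueError for unknown parameters.
        pass
    return estimator


est = set_param_if_supported(DecisionTreeClassifier(), store_leaf_values=True)
print(est.get_params().get("store_leaf_values", "not supported"))  # -> "not supported"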
115 changes: 111 additions & 4 deletions sktree/tree/tests/test_honest_tree.py
@@ -2,6 +2,7 @@
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

@@ -120,11 +121,117 @@ def test_sklearn_compatible_estimator(estimator, check):
check(estimator)


def test_error_with_sklearn_trees():
def test_with_sklearn_trees():
X = np.ones((20, 4))
X[10:] *= -1
y = [0] * 10 + [1] * 10

with pytest.raises(RuntimeError, match="Instead of using sklearn.tree"):
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)
# with pytest.raises(RuntimeError, match="Instead of using sklearn.tree"):
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)


def k_sample_transform(inputs, test_type="normal"):
"""
Computes a `k`-sample transform of the inputs.
For :math:`k` groups, this creates two matrices, the first vertically stacks the
inputs.
In order to use this function, the inputs must have the same number of dimensions
:math:`p` and can have varying number of samples :math:`n`. The second output is a
label
matrix the one-hoc encodes the groups. The outputs are thus ``(N, p)`` and
``(N, k)`` where `N` is the total number of samples. In the case where the test
a random forest based tests, it creates a ``(N, 1)`` where the entries are
varlues from 1 to :math:`k` based on the number of samples.
Parameters
----------
inputs : list of ndarray
A list of the inputs. All inputs must be ``(n, p)`` where `n` is the number
of samples and `p` is the number of dimensions. `n` can vary between samples,
but `p` must be the same among all the samples.
test_type : {"normal", "rf"}, default: "normal"
Whether to one-hoc encode the inputs ("normal") or use a one-dimensional
categorical encoding ("rf").
Returns
-------
u : ndarray
The matrix of concatenated inputs of shape ``(N, p)``.
v : ndarray
The label matrix of shape ``(N, k)`` ("normal") or ``(N, 1)`` ("rf").
"""
n_inputs = len(inputs)
u = np.vstack(inputs)
if np.var(u) == 0:
raise ValueError("Test cannot be run, the inputs have 0 variance")

if test_type == "rf":
v = np.vstack([np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(n_inputs)])
elif test_type == "normal":
if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
else:
raise ValueError("test_type must be normal or rf")

return u, v
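
A quick shape check for the helper above, assuming numpy is imported as np; the group arrays are arbitrary random data.

rng = np.random.default_rng(0)
groups = [rng.standard_normal((n, 3)) for n in (5, 8, 4)]  # three groups, p=3

u, v = k_sample_transform(groups)              # u: (17, 3), v: (17, 3) one-hot labels
u_rf, v_rf = k_sample_transform(groups, "rf")  # v_rf: (17, 1) with values 0, 1, 2
print(u.shape, v.shape, v_rf.shape)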


@pytest.mark.skip()
def test_sklearn_tree_regression():
"""Test against regression in power-curves discussed in:"""

def quadratic(n, p, noise=False, seed=None):
rng = np.random.default_rng(seed)

x = rng.standard_normal(size=(n, p))
coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(p)])
eps = rng.standard_normal(size=(n, p))

x_coeffs = x * coeffs
y = x_coeffs**2 + noise * eps

n1 = x.shape[0]
n2 = y.shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
x = np.vstack((x, y))
return x, v

# generate the high-dimensional quadratic data
X, y = quadratic(1024, 4096, noise=False, seed=0)
print(X.shape, y.shape)
print(np.sum(y) / len(y))
assert False
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier(), random_state=0)
honestsk_scores = cross_val_score(clf, X, y, cv=5)
print(honestsk_scores)

clf = HonestTreeClassifier(tree_estimator=DecisionTreeClassifier(), random_state=0)
honest_scores = cross_val_score(clf, X, y, cv=5)
print(honest_scores)

clf = HonestTreeClassifier(random_state=0)
honest_scores = cross_val_score(clf, X, y, cv=5)
print(honest_scores)

skest = skDecisionTreeClassifier(random_state=0)
sk_scores = cross_val_score(skest, X, y, cv=5)

est = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(est, X, y, cv=5)

print(sk_scores, scores)
print(np.mean(sk_scores), np.mean(scores))
assert np.mean(sk_scores) == np.mean(scores)
assert False
