[FIX] Adding fix to allow any kind of tree for honesttree (#158)
* Adding fix to allow any kind of tree for honesttree

---------

Signed-off-by: Adam Li <adam2392@gmail.com>
adam2392 authored Oct 27, 2023
1 parent a81c597 commit d6d0b16
Showing 3 changed files with 142 additions and 17 deletions.
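
For context, this is the usage pattern the commit enables: a plain scikit-learn decision tree passed as the tree_estimator of HonestTreeClassifier and fit directly, mirroring the new test added below. A minimal sketch; the import path for HonestTreeClassifier is assumed from the test module's location, everything else follows the diff.

import numpy as np
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier

from sktree.tree import HonestTreeClassifier  # assumed import path

# Toy two-class data, as in the new test below.
X = np.ones((20, 4))
X[10:] *= -1
y = [0] * 10 + [1] * 10

# Previously this raised "Instead of using sklearn.tree, use trees import from sktree.";
# with this fix the honest wrapper fits, albeit with a limited API
# (sktree-only parameters such as store_leaf_values are skipped).
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)
print(clf.predict(X[:2]))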
1 change: 1 addition & 0 deletions doc/whats_new/v0.3.rst
@@ -17,6 +17,7 @@ Changelog
- |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
- |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`)
- |Fix| Fixed usage of ``feature_importances_`` property in ``HonestForestClassifier``, by `Adam Li`_ (:pr:`156`)
- |Fix| Fixed ``HonestForestClassifier`` to allow decision-trees from sklearn, albeit with a limited API, by `Adam Li`_ (:pr:`158`)

Code and Documentation Contributors
-----------------------------------
43 changes: 30 additions & 13 deletions sktree/tree/_honest_tree.py
@@ -4,7 +4,6 @@
import numpy as np
from sklearn.base import ClassifierMixin, MetaEstimatorMixin, _fit_context, clone
from sklearn.ensemble._base import _set_random_states
from sklearn.tree._classes import BaseDecisionTree as skBaseDecisionTree
from sklearn.utils.multiclass import _check_partial_fit_first_call, check_classification_targets
from sklearn.utils.validation import check_is_fitted, check_X_y

@@ -555,9 +554,10 @@ def _fit(
store_leaf_values=self.store_leaf_values,
)
else:
# XXX: Remove this?
# we throw an error if the user is using trees from sklearn:main
if isinstance(self.tree_estimator, skBaseDecisionTree):
raise RuntimeError("Instead of using sklearn.tree, use trees import from sktree.")
# if isinstance(self.tree_estimator, skBaseDecisionTree):
# raise RuntimeError("Instead of using sklearn.tree, use trees import from sktree.")

# XXX: maybe error out if the tree_estimator is already fitted
self.estimator_ = clone(self.tree_estimator)
@@ -575,23 +575,40 @@ def _fit(
random_state=self.random_state,
min_impurity_decrease=self.min_impurity_decrease,
ccp_alpha=self.ccp_alpha,
monotonic_cst=self.monotonic_cst,
store_leaf_values=self.store_leaf_values,
)
)
try:
self.estimator_.set_params(**dict(monotonic_cst=self.monotonic_cst))
self.estimator_.set_params(
**dict(
store_leaf_values=self.store_leaf_values,
)
)
except Exception:
print("Using sklearn tree")

if self.random_state is not None:
_set_random_states(self.estimator_, self.random_state)

# Learn structure on subsample
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
classes=classes,
)
# XXX: this allows us to use BaseDecisionTree without partial_fit API
try:
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
classes=classes,
)
except Exception:
self.estimator_._fit(
X,
y,
sample_weight=_sample_weight,
check_input=check_input,
missing_values_in_feature_mask=missing_values_in_feature_mask,
)
self._inherit_estimator_attributes()

# update the number of classes, unsplit
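
In short, the new code tries the sktree-specific keyword arguments first and falls back to the plain sklearn signature when they are rejected. A minimal standalone sketch of that pattern using only public sklearn API; the helper name set_param_if_supported is illustrative, not part of sktree.

from sklearn.tree import DecisionTreeClassifier


def set_param_if_supported(estimator, **params):
    # Same idea as the try/except above: attempt to set sktree-only
    # parameters and fall back silently when the estimator rejects them.
    try:
        estimator.set_params(**params)
    except ValueError:
        # Plain sklearn trees raise ValueError for unknown parameters.
        pass
    return estimator


est = set_param_if_supported(DecisionTreeClassifier(), store_leaf_values=True)
print(est.get_params().get("store_leaf_values", "not supported"))  # -> "not supported"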
115 changes: 111 additions & 4 deletions sktree/tree/tests/test_honest_tree.py
@@ -2,6 +2,7 @@
import pytest
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier as skDecisionTreeClassifier
from sklearn.utils.estimator_checks import parametrize_with_checks

@@ -120,11 +121,117 @@ def test_sklearn_compatible_estimator(estimator, check):
check(estimator)


def test_error_with_sklearn_trees():
def test_with_sklearn_trees():
X = np.ones((20, 4))
X[10:] *= -1
y = [0] * 10 + [1] * 10

with pytest.raises(RuntimeError, match="Instead of using sklearn.tree"):
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)
# with pytest.raises(RuntimeError, match="Instead of using sklearn.tree"):
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier())
clf.fit(X, y)


def k_sample_transform(inputs, test_type="normal"):
"""
Computes a `k`-sample transform of the inputs.
For :math:`k` groups, this creates two matrices, the first vertically stacks the
inputs.
In order to use this function, the inputs must have the same number of dimensions
:math:`p` and can have varying number of samples :math:`n`. The second output is a
label
matrix the one-hoc encodes the groups. The outputs are thus ``(N, p)`` and
``(N, k)`` where `N` is the total number of samples. In the case where the test
a random forest based tests, it creates a ``(N, 1)`` where the entries are
varlues from 1 to :math:`k` based on the number of samples.
Parameters
----------
inputs : list of ndarray
A list of the inputs. All inputs must be ``(n, p)`` where `n` is the number
of samples and `p` is the number of dimensions. `n` can vary between samples,
but `p` must be the same among all the samples.
test_type : {"normal", "rf"}, default: "normal"
Whether to one-hoc encode the inputs ("normal") or use a one-dimensional
categorical encoding ("rf").
Returns
-------
u : ndarray
The matrix of concatenated inputs of shape ``(N, p)``.
v : ndarray
The label matrix of shape ``(N, k)`` ("normal") or ``(N, 1)`` ("rf").
"""
n_inputs = len(inputs)
u = np.vstack(inputs)
if np.var(u) == 0:
raise ValueError("Test cannot be run, the inputs have 0 variance")

if test_type == "rf":
v = np.vstack([np.repeat(i, inputs[i].shape[0]).reshape(-1, 1) for i in range(n_inputs)])
elif test_type == "normal":
if n_inputs == 2:
n1 = inputs[0].shape[0]
n2 = inputs[1].shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
else:
vs = []
for i in range(n_inputs):
n = inputs[i].shape[0]
encode = np.zeros(shape=(n, n_inputs))
encode[:, i] = np.ones(shape=n)
vs.append(encode)
v = np.concatenate(vs)
else:
raise ValueError("test_type must be normal or rf")

return u, v
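
A quick shape check for the helper above, assuming numpy is imported as np; the group arrays are arbitrary random data.

rng = np.random.default_rng(0)
groups = [rng.standard_normal((n, 3)) for n in (5, 8, 4)]  # three groups, p=3

u, v = k_sample_transform(groups)              # u: (17, 3), v: (17, 3) one-hot labels
u_rf, v_rf = k_sample_transform(groups, "rf")  # v_rf: (17, 1) with values 0, 1, 2
print(u.shape, v.shape, v_rf.shape)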


@pytest.mark.skip()
def test_sklearn_tree_regression():
"""Test against regression in power-curves discussed in:"""

def quadratic(n, p, noise=False, seed=None):
rng = np.random.default_rng(seed)

x = rng.standard_normal(size=(n, p))
coeffs = np.array([np.exp(-0.0325 * (i + 24)) for i in range(p)])
eps = rng.standard_normal(size=(n, p))

x_coeffs = x * coeffs
y = x_coeffs**2 + noise * eps

n1 = x.shape[0]
n2 = y.shape[0]
v = np.vstack([np.zeros((n1, 1)), np.ones((n2, 1))])
x = np.vstack((x, y))
return x, v

# generate the high-dimensional quadratic data
X, y = quadratic(1024, 4096, noise=False, seed=0)
print(X.shape, y.shape)
print(np.sum(y) / len(y))
assert False
clf = HonestTreeClassifier(tree_estimator=skDecisionTreeClassifier(), random_state=0)
honestsk_scores = cross_val_score(clf, X, y, cv=5)
print(honestsk_scores)

clf = HonestTreeClassifier(tree_estimator=DecisionTreeClassifier(), random_state=0)
honest_scores = cross_val_score(clf, X, y, cv=5)
print(honest_scores)

clf = HonestTreeClassifier(random_state=0)
honest_scores = cross_val_score(clf, X, y, cv=5)
print(honest_scores)

skest = skDecisionTreeClassifier(random_state=0)
sk_scores = cross_val_score(skest, X, y, cv=5)

est = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(est, X, y, cv=5)

print(sk_scores, scores)
print(np.mean(sk_scores), np.mean(scores))
assert np.mean(sk_scores) == np.mean(scores)
assert False
