Skip to content

Commit

Permalink
Fix for MIGHT p-value shuffling all columns by default (#140)
Browse files Browse the repository at this point in the history
* Fix for the Coleman algorithm API

---------

Signed-off-by: Sampan <36676569+sampan501@users.noreply.github.com>
Co-authored-by: Adam Li <adam2392@gmail.com>
  • Loading branch information
sampan501 and adam2392 authored Oct 11, 2023
1 parent 74bd684 commit 042cd02
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 9 deletions.
2 changes: 2 additions & 0 deletions doc/whats_new/v0.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Version 0.3
Changelog
---------
- |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`)
- |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`)

Code and Documentation Contributors
-----------------------------------
Expand All @@ -21,4 +22,5 @@ Thanks to everyone who has contributed to the maintenance and improvement of
the project since version inception, including:

* `Adam Li`_
* `Sambit Panda`_

1 change: 0 additions & 1 deletion sktree/experimental/sdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ def __init__(
monotonic_cst=None,
n_swaps=1,
):

super().__init__(
n_estimators=n_estimators,
criterion=criterion,
Expand Down
6 changes: 5 additions & 1 deletion sktree/stats/forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,8 @@ def test(
y : ArrayLike of shape (n_samples, n_outputs)
The target matrix.
covariate_index : ArrayLike, optional of shape (n_covariates,)
The index array of covariates to shuffle, by default None.
            The index array of covariates to shuffle; all columns are shuffled
            by default (i.e. when ``None`` is passed).
metric : str, optional
The metric to compute, by default "mse".
n_repeats : int, optional
Expand Down Expand Up @@ -465,6 +466,9 @@ def test(
observe_stat = self.observe_stat_

# next permute the data
if covariate_index is None:
covariate_index = np.arange(X.shape[1], dtype=int)

permute_stat, permute_posteriors, permute_samples = self.statistic(
X,
y,
Expand Down
44 changes: 37 additions & 7 deletions sktree/stats/tests/test_forestht.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,11 +358,11 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da

# if the sample_dataset_per_tree, then the indices should be different across all
if sample_dataset_per_tree:
for (indices, other_indices) in combinations(clf.train_test_samples_, 2):
for indices, other_indices in combinations(clf.train_test_samples_, 2):
assert not np.array_equal(indices[0], other_indices[0])
assert not np.array_equal(indices[1], other_indices[1])
else:
for (indices, other_indices) in combinations(clf.train_test_samples_, 2):
for indices, other_indices in combinations(clf.train_test_samples_, 2):
assert_array_equal(indices[0], other_indices[0])
assert_array_equal(indices[1], other_indices[1])

Expand All @@ -387,7 +387,7 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da


@pytest.mark.parametrize("seed", [None, 0])
def test_small_dataset(seed):
def test_small_dataset_independent(seed):
n_samples = 32
n_features = 5

Expand All @@ -408,13 +408,43 @@ def test_small_dataset(seed):
assert pvalue > 0.05

stat, pvalue = clf.test(X, y, metric="mi")
if seed is not None:
assert stat == 0.0
else:
assert_almost_equal(stat, 0.0, decimal=1)
assert_almost_equal(stat, 0.0, decimal=1)
assert pvalue > 0.05


@flaky(max_runs=3)
@pytest.mark.parametrize("seed", [None, 0])
def test_small_dataset_dependent(seed):
n_samples = 100
n_features = 5
rng = np.random.default_rng(seed)

X = rng.uniform(size=(n_samples, n_features))
X = rng.uniform(size=(n_samples // 2, n_features))
X2 = X + 3
X = np.vstack([X, X2])
y = np.vstack(
[np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))]
) # Binary classification
print(X.shape, y.shape)

clf = FeatureImportanceForestClassifier(
estimator=HonestForestClassifier(
n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5
),
test_size=0.2,
permute_per_tree=False,
sample_dataset_per_tree=False,
)
stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi")
assert ~np.isnan(pvalue)
assert ~np.isnan(stat)
assert pvalue <= 0.05

stat, pvalue = clf.test(X, y, metric="mi")
assert pvalue <= 0.05


# @pytest.mark.monitor_test
# def test_memory_usage():
# n_samples = 1000
Expand Down

0 comments on commit 042cd02

Please sign in to comment.