From 042cd02d80a78129957b92ed8070bcd262dae23a Mon Sep 17 00:00:00 2001 From: Sambit Panda <36676569+sampan501@users.noreply.github.com> Date: Tue, 10 Oct 2023 20:01:10 -0400 Subject: [PATCH] Fix for MIGHT p-value shuffling all columns by default (#140) * fix for coleman algorithm API --------- Signed-off-by: Sampan <36676569+sampan501@users.noreply.github.com> Co-authored-by: Adam Li --- doc/whats_new/v0.3.rst | 2 ++ sktree/experimental/sdf.py | 1 - sktree/stats/forestht.py | 6 +++- sktree/stats/tests/test_forestht.py | 44 ++++++++++++++++++++++++----- 4 files changed, 44 insertions(+), 9 deletions(-) diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst index 238fcb9ca..ebae29995 100644 --- a/doc/whats_new/v0.3.rst +++ b/doc/whats_new/v0.3.rst @@ -13,6 +13,7 @@ Version 0.3 Changelog --------- - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) +- |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) Code and Documentation Contributors ----------------------------------- @@ -21,4 +22,5 @@ Thanks to everyone who has contributed to the maintenance and improvement of the project since version inception, including: * `Adam Li`_ +* `Sambit Panda`_ diff --git a/sktree/experimental/sdf.py b/sktree/experimental/sdf.py index fafa028ae..a34f5c6e5 100644 --- a/sktree/experimental/sdf.py +++ b/sktree/experimental/sdf.py @@ -108,7 +108,6 @@ def __init__( monotonic_cst=None, n_swaps=1, ): - super().__init__( n_estimators=n_estimators, criterion=criterion, diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 4c425b17f..aa9e81e9b 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -429,7 +429,8 @@ def test( y : ArrayLike of shape (n_samples, n_outputs) The target matrix. covariate_index : ArrayLike, optional of shape (n_covariates,) - The index array of covariates to shuffle, by default None. + The index array of covariates to shuffle, will shuffle all columns by + default (corresponding to None). metric : str, optional The metric to compute, by default "mse". n_repeats : int, optional @@ -465,6 +466,9 @@ def test( observe_stat = self.observe_stat_ # next permute the data + if covariate_index is None: + covariate_index = np.arange(X.shape[1], dtype=int) + permute_stat, permute_posteriors, permute_samples = self.statistic( X, y, diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index f63894be7..c16eecbe2 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -358,11 +358,11 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da # if the sample_dataset_per_tree, then the indices should be different across all if sample_dataset_per_tree: - for (indices, other_indices) in combinations(clf.train_test_samples_, 2): + for indices, other_indices in combinations(clf.train_test_samples_, 2): assert not np.array_equal(indices[0], other_indices[0]) assert not np.array_equal(indices[1], other_indices[1]) else: - for (indices, other_indices) in combinations(clf.train_test_samples_, 2): + for indices, other_indices in combinations(clf.train_test_samples_, 2): assert_array_equal(indices[0], other_indices[0]) assert_array_equal(indices[1], other_indices[1]) @@ -387,7 +387,7 @@ def test_permute_per_tree_samples_consistency_with_sklearnforest(seed, sample_da @pytest.mark.parametrize("seed", [None, 0]) -def test_small_dataset(seed): +def test_small_dataset_independent(seed): n_samples = 32 n_features = 5 @@ -408,13 +408,43 @@ def test_small_dataset(seed): assert pvalue > 0.05 stat, pvalue = clf.test(X, y, metric="mi") - if seed is not None: - assert stat == 0.0 - else: - assert_almost_equal(stat, 0.0, decimal=1) + assert_almost_equal(stat, 0.0, decimal=1) assert pvalue > 0.05 +@flaky(max_runs=3) +@pytest.mark.parametrize("seed", [None, 0]) +def test_small_dataset_dependent(seed): + n_samples = 100 + n_features = 5 + rng = np.random.default_rng(seed) + + X = rng.uniform(size=(n_samples, n_features)) + X = rng.uniform(size=(n_samples // 2, n_features)) + X2 = X + 3 + X = np.vstack([X, X2]) + y = np.vstack( + [np.zeros((n_samples // 2, 1)), np.ones((n_samples // 2, 1))] + ) # Binary classification + print(X.shape, y.shape) + + clf = FeatureImportanceForestClassifier( + estimator=HonestForestClassifier( + n_estimators=10, random_state=seed, n_jobs=1, honest_fraction=0.5 + ), + test_size=0.2, + permute_per_tree=False, + sample_dataset_per_tree=False, + ) + stat, pvalue = clf.test(X, y, covariate_index=[1, 2], metric="mi") + assert ~np.isnan(pvalue) + assert ~np.isnan(stat) + assert pvalue <= 0.05 + + stat, pvalue = clf.test(X, y, metric="mi") + assert pvalue <= 0.05 + + # @pytest.mark.monitor_test # def test_memory_usage(): # n_samples = 1000