From 359ea75740910513e582f5fc6b5a74d7e6da03d9 Mon Sep 17 00:00:00 2001
From: Yuxin <99897042+YuxinB@users.noreply.github.com>
Date: Thu, 19 Oct 2023 14:05:34 -0400
Subject: [PATCH] Stratify sampling when split train/test data (#143)

* Stratify sampling when split train/test data

---------

Co-authored-by: Haoyin Xu
Co-authored-by: Adam Li
Co-authored-by: Sambit Panda <36676569+sampan501@users.noreply.github.com>
---
 doc/whats_new/_contributors.rst               |  1 +
 doc/whats_new/v0.3.rst                        |  3 +-
 ...t_MI_genuine_hypothesis_testing_forest.py} | 16 +++----
 .../plot_MI_imbalanced_hyppo_testing.py       |  8 ++--
 requirements.txt                              |  1 +
 sktree/stats/forestht.py                      | 42 ++++++++++++-------
 sktree/stats/tests/test_forestht.py           | 32 ++++++++++++++
 7 files changed, 76 insertions(+), 27 deletions(-)
 rename examples/hypothesis_testing/{plot_MI_gigantic_hypothesis_testing_forest.py => plot_MI_genuine_hypothesis_testing_forest.py} (94%)

diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst
index 3e5ca2110..eb441d66d 100644
--- a/doc/whats_new/_contributors.rst
+++ b/doc/whats_new/_contributors.rst
@@ -26,3 +26,4 @@
 .. _SUKI-O : https://github.com/SUKI-O
 .. _Ronan Perry : https://rflperry.github.io/
 .. _Haoyin Xu : https://github.com/PSSF23
+.. _Yuxin Bai : https://github.com/YuxinB
diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst
index fec97bb01..7b163ef19 100644
--- a/doc/whats_new/v0.3.rst
+++ b/doc/whats_new/v0.3.rst
@@ -15,6 +15,7 @@ Changelog
 - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`)
 - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`)
 - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`)
+- |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`)
 
 Code and Documentation Contributors
 -----------------------------------
@@ -24,4 +25,4 @@ the project since version inception, including:
 
 * `Adam Li`_
 * `Sambit Panda`_
-
+* `Yuxin Bai`_
diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py
similarity index 94%
rename from examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
rename to examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py
index 423bc63dc..e6831a9e7 100644
--- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py
+++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py
@@ -1,7 +1,7 @@
 """
-==========================================================
-Mutual Information for Gigantic Hypothesis Testing (MIGHT)
-==========================================================
+=========================================================
+Mutual Information for Genuine Hypothesis Testing (MIGHT)
+=========================================================
 
 An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric
 multivariate hypothesis test, on simulated datasets. Here, we present a simulation
@@ -49,8 +49,8 @@
 # We simulate the two feature sets, and the target variable. We then combine them
 # into a single dataset to perform hypothesis testing.
 
-n_samples = 1000
-n_features_set = 500
+n_samples = 2000
+n_features_set = 20
 mean = 1.0
 sigma = 2.0
 beta = 5.0
@@ -91,7 +91,7 @@
 # computed as the proportion of samples in the null distribution that are less than the
 # observed test statistic.
 
-n_estimators = 200
+n_estimators = 100
 max_features = "sqrt"
 test_size = 0.2
 n_repeats = 1000
@@ -103,12 +103,12 @@
         max_features=max_features,
         tree_estimator=DecisionTreeClassifier(),
         random_state=seed,
-        honest_fraction=0.7,
+        honest_fraction=0.25,
         n_jobs=n_jobs,
     ),
     random_state=seed,
     test_size=test_size,
-    permute_per_tree=True,
+    permute_per_tree=False,
     sample_dataset_per_tree=False,
 )
diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
index 882f80c3d..c8a5478a4 100644
--- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
+++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py
@@ -1,7 +1,7 @@
 """
-===============================================================================
-Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data
-===============================================================================
+==============================================================================
+Mutual Information for Genuine Hypothesis Testing (MIGHT) with Imbalanced Data
+==============================================================================
 
 Here, we demonstrate how to do hypothesis testing on highly imbalanced data
 in terms of their feature-set dimensionalities.
@@ -17,7 +17,7 @@
 
 For other examples of hypothesis testing, see the following:
 
-- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_gigantic_hypothesis_testing_forest.py`
+- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_genuine_hypothesis_testing_forest.py`
 - :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py`
 
 For more information on the multi-view decision-tree, see
diff --git a/requirements.txt b/requirements.txt
index 99963814e..978f90fce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 numpy>=1.25
 scipy>=1.11
 scikit-learn>=1.3.1
+
diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py
index 4d6dc7b77..56a044c5c 100644
--- a/sktree/stats/forestht.py
+++ b/sktree/stats/forestht.py
@@ -122,6 +122,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=True,
         sample_dataset_per_tree=True,
+        stratify=False,
     ):
         self.estimator = estimator
         self.random_state = random_state
@@ -129,6 +130,7 @@ def __init__(
         self.test_size = test_size
         self.permute_per_tree = permute_per_tree
         self.sample_dataset_per_tree = sample_dataset_per_tree
+        self.stratify = stratify
 
         self.n_samples_test_ = None
         self._n_samples_ = None
@@ -160,8 +162,9 @@ def reset(self):
         self.n_features_in_ = None
         self._is_fitted = False
         self._seeds = None
+        self._y = None
 
-    def _get_estimators_indices(self, sample_separate=False):
+    def _get_estimators_indices(self, stratifier=None, sample_separate=False):
         indices = np.arange(self._n_samples_, dtype=int)
 
         # Get drawn indices along both sample and feature axes
@@ -191,7 +194,11 @@ def _get_estimators_indices(self, sample_separate=False):
                 # Operations accessing random_state must be performed identically
                 # to those in `_parallel_build_trees()`
                 indices_train, indices_test = train_test_split(
-                    indices, test_size=self.test_size, shuffle=True, random_state=seed
+                    indices,
+                    test_size=self.test_size,
+                    shuffle=True,
+                    stratify=stratifier,
+                    random_state=seed,
                 )
 
                 yield indices_train, indices_test
@@ -202,12 +209,13 @@ def _get_estimators_indices(self, sample_separate=False):
         else:
             self._seeds = self.estimator_.random_state
 
-            # TODO: make random_state consistent
             indices_train, indices_test = train_test_split(
                 indices,
                 test_size=self.test_size,
+                stratify=stratifier,
                 random_state=self._seeds,
             )
+
             for _ in self.estimator_.estimators_:
                 yield indices_train, indices_test
 
@@ -227,9 +235,12 @@ def train_test_samples_(self):
         if self._n_samples_ is None:
             raise RuntimeError("The estimator must be fitted before accessing this attribute.")
 
+        # Stratifier uses a cached _y attribute if available
+        stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None
+
         return [
             (indices_train, indices_test)
-            for indices_train, indices_test in self._get_estimators_indices()
+            for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier)
         ]
 
     def _statistic(
@@ -329,6 +340,8 @@ def statistic(
         if self._n_samples_ is None:
             self._n_samples_, self.n_features_in_ = X.shape
+
+        # Infer type of target y
         if self._type_of_target_ is None:
             self._type_of_target_ = type_of_target(y)
 
@@ -339,9 +352,9 @@ def statistic(
             self.permuted_estimator_ = self._get_estimator()
             estimator = self.permuted_estimator_
 
-            # Infer type of target y
-            if not hasattr(self, "_type_of_target"):
-                self._type_of_target_ = type_of_target(y)
+            # Store a cache of the y variable
+            if is_classifier(self._get_estimator()):
+                self._y = y.copy()
 
         # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking
         # and then setting the internal meta data structures
@@ -462,10 +475,10 @@ def test(
             observe_posteriors = self.observe_posteriors_
             observe_stat = self.observe_stat_
 
-        # next permute the data
         if covariate_index is None:
             covariate_index = np.arange(X.shape[1], dtype=int)
 
+        # next permute the data
         permute_stat, permute_posteriors, permute_samples = self.statistic(
             X,
             y,
@@ -724,9 +737,7 @@ def _statistic(
                     self.permute_per_tree,
                     self._type_of_target_,
                 )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
-                )
+                for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_)
             )
         else:
             # fitting a forest will only get one unique train/test split
@@ -825,6 +836,9 @@ class FeatureImportanceForestClassifier(BaseForestHT):
     sample_dataset_per_tree : bool, default=False
         Whether to sample the dataset per tree or per forest.
 
+    stratify : bool, default=True
+        Whether to stratify the samples by class labels.
+
     Attributes
     ----------
     estimator_ : BaseForest
@@ -877,6 +891,7 @@ def __init__(
         test_size=0.2,
         permute_per_tree=True,
         sample_dataset_per_tree=True,
+        stratify=True,
     ):
         super().__init__(
             estimator=estimator,
@@ -885,6 +900,7 @@ def __init__(
             test_size=test_size,
             permute_per_tree=permute_per_tree,
             sample_dataset_per_tree=sample_dataset_per_tree,
+            stratify=stratify,
         )
 
     def _get_estimator(self):
@@ -945,9 +961,7 @@ def _statistic(
                     self.permute_per_tree,
                     self._type_of_target_,
                 )
-                for idx, (indices_train, indices_test) in enumerate(
-                    self._get_estimators_indices(sample_separate=True)
-                )
+                for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_)
             )
         else:
             # fitting a forest will only get one unique train/test split
diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py
index cecf34b8c..e71e5e09b 100644
--- a/sktree/stats/tests/test_forestht.py
+++ b/sktree/stats/tests/test_forestht.py
@@ -69,6 +69,38 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree):
     est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi")
 
 
+@pytest.mark.parametrize("sample_dataset_per_tree", [True, False])
+def test_featureimportance_forest_stratified(sample_dataset_per_tree):
+    est = FeatureImportanceForestClassifier(
+        estimator=RandomForestClassifier(
+            n_estimators=10,
+            random_state=seed,
+        ),
+        permute_per_tree=True,
+        test_size=0.7,
+        random_state=seed,
+        sample_dataset_per_tree=sample_dataset_per_tree,
+    )
+    n_samples = 100
+    est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi")
+
+    _, indices_test = est.train_test_samples_[0]
+    y_test = iris_y[indices_test]
+
+    assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), (
+        f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}"
+    )
+
+    est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi")
+
+    _, indices_test = est.train_test_samples_[0]
+    y_test = iris_y[indices_test]
+
+    assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), (
+        f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}"
+    )
+
+
 def test_featureimportance_forest_errors():
     permute_per_tree = False
     sample_dataset_per_tree = True
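
Usage note (editorial addition, not part of the commit): the sketch below shows how the new ``stratify`` keyword could be exercised end-to-end. It mirrors the test added by this patch, and it assumes the patch is applied and that the wrapped forest may be scikit-learn's ``RandomForestClassifier``, as in that test; the dataset, seed, and variable names are illustrative.

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier

    from sktree.stats import FeatureImportanceForestClassifier

    seed = 12345

    # The first 100 iris samples are 50 of class 0 and 50 of class 1, so a
    # stratified split should keep the held-out test set perfectly balanced.
    iris_X, iris_y = load_iris(return_X_y=True)
    X, y = iris_X[:100], iris_y[:100]

    est = FeatureImportanceForestClassifier(
        estimator=RandomForestClassifier(n_estimators=10, random_state=seed),
        test_size=0.2,
        random_state=seed,
        stratify=True,  # added by this patch; defaults to True for the classifier
    )
    stat = est.statistic(X, y, metric="mi")

    # Each cached train/test split now preserves the 50/50 class balance.
    _, indices_test = est.train_test_samples_[0]
    print(np.bincount(y[indices_test]))  # expect [10 10] with test_size=0.2

Note the asymmetry in defaults: ``BaseForestHT`` keeps ``stratify=False``, while ``FeatureImportanceForestClassifier`` passes ``stratify=True``, and the ``is_classifier`` guard in ``train_test_samples_`` ensures stratification is only applied to classification targets.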