From 8f7d52fc013e3f3ec85636284f6876ad0ccfeebe Mon Sep 17 00:00:00 2001 From: Haoyin Xu Date: Tue, 17 Oct 2023 11:59:39 -0400 Subject: [PATCH 1/4] FIX add max_fpr argument to auc example --- examples/hypothesis_testing/plot_might_auc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/hypothesis_testing/plot_might_auc.py b/examples/hypothesis_testing/plot_might_auc.py index 101ad5b7f..fd032972a 100644 --- a/examples/hypothesis_testing/plot_might_auc.py +++ b/examples/hypothesis_testing/plot_might_auc.py @@ -94,6 +94,7 @@ y, metric=metric, return_posteriors=True, + max_fpr=max_fpr, ) print(f"ASH-90 / Partial AUC: {stat}") @@ -110,6 +111,7 @@ y, metric=metric, return_posteriors=True, + max_fpr=max_fpr, ) print(f"ASH-90 / Partial AUC: {stat}") From 9ad065efbf41b0acf41a2978d1068f6b532ac64e Mon Sep 17 00:00:00 2001 From: Adam Li Date: Tue, 17 Oct 2023 14:36:46 -0400 Subject: [PATCH 2/4] [DOC] Adding citation file (#144) * Adding citation file --------- Signed-off-by: Adam Li Co-authored-by: Sambit Panda Co-authored-by: Haoyin Xu --- .github/workflows/cffconvert.yml | 22 ++++++++++++++++++++++ CITATION.cff | 31 +++++++++++++++++++++++++++++++ README.md | 1 + sktree/_lib/sklearn_fork | 2 +- style_requirements.txt | 9 +++++++++ 5 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/cffconvert.yml create mode 100644 CITATION.cff create mode 100644 style_requirements.txt diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml new file mode 100644 index 000000000..60fd8f62b --- /dev/null +++ b/.github/workflows/cffconvert.yml @@ -0,0 +1,22 @@ +name: cffconvert + +on: + push: + paths: + - CITATION.cff + pull_request: + paths: + - CITATION.cff + +jobs: + validate: + name: "validate" + runs-on: ubuntu-latest + steps: + - name: Check out a copy of the repository + uses: actions/checkout@v4 + + - name: Check whether the citation metadata from CITATION.cff is valid + uses: citation-file-format/cffconvert-github-action@2.0.0 + with: + args: "--validate" \ No newline at end of file diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..e3bf96722 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,31 @@ +# YAML 1.2 +--- +# Metadata for citation of this software according to the CFF format (https://citation-file-format.github.io/) +cff-version: 1.2.0 +title: "Scikit-tree: Modern decision-trees compatible with scikit-learn in Python." +abstract: "scikit-tree is a scikit-learn compatible API for building state-of-the-art decision trees. These include unsupervised trees, oblique trees, uncertainty trees, quantile trees and causal trees." +authors: + - given-names: Adam + family-names: Li + affiliation: "Department of Computer Science, Columbia University, New York, NY, USA" + orcid: "https://orcid.org/0000-0001-8421-365X" + - given-names: Sambit + family-names: Panda + affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" + orcid: "https://orcid.org/0000-0001-8455-4243" + - given-names: Haoyin + family-names: Xu + affiliation: "Department of Biomedical Engineering, Johns Hopkins University, Baltimore, MD, USA" + orcid: "https://orcid.org/0000-0001-8235-4950" +type: software +repository-code: "https://github.com/neurodata/scikit-tree" +license: 'BSD-3-Clause' +keywords: + - random forest + - oblique trees + - honest forests + - statisical learning + - machine learning +message: >- + Please cite this software using the metadata from + 'preferred-citation' in the CITATION.cff file. diff --git a/README.md b/README.md index 12076150c..ac815d9ec 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ [![codecov](https://codecov.io/gh/neurodata/scikit-tree/branch/main/graph/badge.svg?token=H1reh7Qwf4)](https://codecov.io/gh/neurodata/scikit-tree) [![PyPI Download count](https://img.shields.io/pypi/dm/scikit-tree.svg)](https://pypistats.org/packages/scikit-tree) [![Latest PyPI release](https://img.shields.io/pypi/v/scikit-tree.svg)](https://pypi.org/project/scikit-tree/) +[![DOI](https://zenodo.org/badge/491260497.svg)](https://zenodo.org/doi/10.5281/zenodo.8412279) scikit-tree =========== diff --git a/sktree/_lib/sklearn_fork b/sktree/_lib/sklearn_fork index 6c7a5f44e..1adb20907 160000 --- a/sktree/_lib/sklearn_fork +++ b/sktree/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 6c7a5f44eb4ec3bea5dd6a9e4d5db748d12b209e +Subproject commit 1adb209077f12adac8f760196ae5260abab0cbdd diff --git a/style_requirements.txt b/style_requirements.txt new file mode 100644 index 000000000..c3fb899a6 --- /dev/null +++ b/style_requirements.txt @@ -0,0 +1,9 @@ +mypy +black +isort +flake8 +bandit +pydocstyle +codespell +toml +cython-lint \ No newline at end of file From 860d1979503de104b5f993c17164dc36353a82f9 Mon Sep 17 00:00:00 2001 From: Adam Li Date: Thu, 19 Oct 2023 10:47:55 -0400 Subject: [PATCH 3/4] [DOC] Adding development docs to make sure it is clear how to do dev (#150) * Adding development docs to make sure it is clear how to do dev Signed-off-by: Adam Li * Update reqs Signed-off-by: Adam Li * Fix rendering of cli commands Signed-off-by: Adam Li --------- Signed-off-by: Adam Li --- DEVELOPING.md | 122 ++++++++++++++++++++++++++++++++++++++++++++--- requirements.txt | 2 +- 2 files changed, 117 insertions(+), 7 deletions(-) diff --git a/DEVELOPING.md b/DEVELOPING.md index 200789d7c..9fdc18090 100644 --- a/DEVELOPING.md +++ b/DEVELOPING.md @@ -1,14 +1,103 @@ + + +- [Requirements](#requirements) +- [Setting up your development environment](#setting-up-your-development-environment) +- [Building the project from source](#building-the-project-from-source) +- [Development Tasks](#development-tasks) + - [Basic Verification](#basic-verification) + - [Docsite](#docsite) + - [Details](#details) + - [Coding Style](#coding-style) + - [Lint](#lint) + - [Type checking](#type-checking) + - [Unit tests](#unit-tests) +- [Advanced Updating submodules](#advanced-updating-submodules) +- [Cython and C++](#cython-and-c) +- [Making a Release](#making-a-release) + + + # Requirements -* Python 3.8+ -* Poetry (`curl -sSL https://install.python-poetry.org | python - --version=1.2.2`) -For the other requirements, inspect the ``pyproject.toml`` file. If you are updated the dependencies, please run `poetry update` to update the +* Python 3.9+ +* numpy>=1.25 +* scipy>=1.11 +* scikit-learn>=1.3.1 + +For the other requirements, inspect the ``pyproject.toml`` file. + +# Setting up your development environment + +We recommend using miniconda, as python virtual environments may not setup properly compilers necessary for our compiled code. For detailed information on setting up and managing conda environments, see https://conda.io/docs/test-drive.html. + + + + conda create -n sktree + conda activate sktree + +**Make sure you specify a Python version if your system defaults to anything less than Python 3.9.** + +**Any commands should ALWAYS be after you have activated your conda environment.** +Next, install necessary build dependencies. For more information, see https://scikit-learn.org/stable/developers/advanced_installation.html. + + conda install -c conda-forge joblib threadpoolctl pytest compilers llvm-openmp + +Assuming these steps have worked properly and you have read and followed any necessary scikit-learn advanced installation instructions, you can then install dependencies for scikit-tree. + +If you are developing locally, you will need the build dependencies to compile the Cython / C++ code: + + pip install -r build_requirements.txt + +Other requirements can be installed as such: + + pip install -r requirements.txt + pip install -r style_requirements.txt + pip install -r test_requirements.txt + pip install -r doc_requirements.txt + +# Building the project from source + +We leverage meson to build scikit-tree from source. We utilize a CLI tool, called [spin](https://github.com/scientific-python/spin), which wraps certain meson commands to make building easier. + +For example, the following command will build the project completely from scratch + + spin build --clean + +If you have part of the build already done, you can run: + + spin build + +The following command will test the project + + spin test + +For other commands, see + + spin --help + +Note at this stage, you will be unable to run Python commands directly. For example, ``pytest ./sktree`` will not work. + +However, after installing and building the project from source using meson, you can leverage editable installs to make testing code changes much faster. For more information on meson-python's progress supporting editable installs in a better fashion, see https://meson-python.readthedocs.io/en/latest/how-to-guides/editable-installs.html. + + pip install --no-build-isolation --editable . + +**Note: editable installs for scikit-tree REQUIRE you to have built the project using meson already.** This will now link the meson build to your Python runtime. Now if you run + + pytest ./sktree + +the unit-tests should run. # Development Tasks -There are a series of top-level tasks available through Poetry. These can each be run via +There are a series of top-level tasks available through Poetry. If you are updated the dependencies, please run `poetry update` to update the lock file. These can each be run via `poetry run poe ` +To do so, first install poetry and poethepoet. + + pip install poetry poethepoet + +Now, you are ready to run quick commands to format the codebase, lint the codebase and type-check the codebase. + ### Basic Verification * **format** - runs the suite of formatting tools applying tools to make code compliant * **format_check** - runs the suite of formatting tools checking for compliance @@ -53,6 +142,23 @@ In order for any code to be added to the repository, we require unit tests to pa poetry run poe unit_test +# (Advanced) Updating submodules + +Scikit-tree relies on a submodule of a forked-version of scikit-learn for certain Python and Cython code that extends the ``DecisionTree*`` models. Usually, if a developer is making changes, they should go over to the ``submodulev3`` branch on ``https://github.com/neurodata/scikit-learn`` and +submit a PR to make changes to the submodule. + +This should **ALWAYS** be supported by some use-case in scikit-tree. We want the minimal amount of code-change in our forked version of scikit-learn to make it very easy to merge in upstream changes, bug fixes and features for tree-based code. + +Once a PR is submitted and merged, the developer can update the submodule here in scikit-tree, so that we leverage the new commit. You **must** update the submodule commit ID and also commit this change, so that way the build leverages the new submodule commit ID. + + git submodule update --init --recursive --remote + git add -A + git commit -m "Update submodule" -s + +Now, you can re-build the project using the latest submodule changes. + + spin build --clean + # Cython and C++ The general design of scikit-tree follows that of the tree-models inside scikit-learn, where tree-based models are inherently Cythonized, or written with C++. Then the actual forest (e.g. RandomForest, or ExtraForest) is just a Python API wrapper that creates an ensemble of the trees. @@ -68,13 +174,17 @@ https://github.com/neurodata/scikit-tree/actions/workflows/build_wheels.yml will 2. Upload wheels to test PyPi - twine upload --repository-url https://test.pypi.org/legacy/ dist/* +``` +twine upload --repository-url https://test.pypi.org/legacy/ dist/* +``` Verify that installations work as expected on your machine. 3. Upload wheels - twine upload dist/* +``` +twine upload dist/* +``` or if you have two-factor authentication enabled: https://pypi.org/help/#apitoken diff --git a/requirements.txt b/requirements.txt index dc42ea7e7..99963814e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ numpy>=1.25 -scipy +scipy>=1.11 scikit-learn>=1.3.1 From 359ea75740910513e582f5fc6b5a74d7e6da03d9 Mon Sep 17 00:00:00 2001 From: Yuxin <99897042+YuxinB@users.noreply.github.com> Date: Thu, 19 Oct 2023 14:05:34 -0400 Subject: [PATCH 4/4] Stratify sampling when split train/test data (#143) * Stratify sampling when split train/test data --------- Co-authored-by: Haoyin Xu Co-authored-by: Adam Li Co-authored-by: Sambit Panda <36676569+sampan501@users.noreply.github.com> --- doc/whats_new/_contributors.rst | 1 + doc/whats_new/v0.3.rst | 3 +- ...t_MI_genuine_hypothesis_testing_forest.py} | 16 +++---- .../plot_MI_imbalanced_hyppo_testing.py | 8 ++-- requirements.txt | 1 + sktree/stats/forestht.py | 42 ++++++++++++------- sktree/stats/tests/test_forestht.py | 32 ++++++++++++++ 7 files changed, 76 insertions(+), 27 deletions(-) rename examples/hypothesis_testing/{plot_MI_gigantic_hypothesis_testing_forest.py => plot_MI_genuine_hypothesis_testing_forest.py} (94%) diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index 3e5ca2110..eb441d66d 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -26,3 +26,4 @@ .. _SUKI-O : https://github.com/SUKI-O .. _Ronan Perry : https://rflperry.github.io/ .. _Haoyin Xu : https://github.com/PSSF23 +.. _Yuxin Bai : https://github.com/YuxinB diff --git a/doc/whats_new/v0.3.rst b/doc/whats_new/v0.3.rst index fec97bb01..7b163ef19 100644 --- a/doc/whats_new/v0.3.rst +++ b/doc/whats_new/v0.3.rst @@ -15,6 +15,7 @@ Changelog - |Fix| Fixes a bug in consistency of train/test samples when ``random_state`` is not set in FeatureImportanceForestClassifier and FeatureImportanceForestRegressor, by `Adam Li`_ (:pr:`135`) - |Fix| Fixes a bug where covariate indices were not shuffled by default when running FeatureImportanceForestClassifier and FeatureImportanceForestRegressor test methods, by `Sambit Panda`_ (:pr:`140`) - |Enhancement| Add multi-view splitter for axis-aligned decision trees, by `Adam Li`_ (:pr:`129`) +- |Enhancement| Add stratified sampling option to ``FeatureImportance*`` via the ``stratify`` keyword argument, by `Yuxin Bai`_ (:pr:`143`) Code and Documentation Contributors ----------------------------------- @@ -24,4 +25,4 @@ the project since version inception, including: * `Adam Li`_ * `Sambit Panda`_ - +* `Yuxin Bai`_ diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py similarity index 94% rename from examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py rename to examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py index 423bc63dc..e6831a9e7 100644 --- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest.py +++ b/examples/hypothesis_testing/plot_MI_genuine_hypothesis_testing_forest.py @@ -1,7 +1,7 @@ """ -=========================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) -=========================================================== +========================================================= +Mutual Information for Genuine Hypothesis Testing (MIGHT) +========================================================= An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric multivariate hypothesis test, on simulated datasets. Here, we present a simulation @@ -49,8 +49,8 @@ # We simulate the two feature sets, and the target variable. We then combine them # into a single dataset to perform hypothesis testing. -n_samples = 1000 -n_features_set = 500 +n_samples = 2000 +n_features_set = 20 mean = 1.0 sigma = 2.0 beta = 5.0 @@ -91,7 +91,7 @@ # computed as the proportion of samples in the null distribution that are less than the # observed test statistic. -n_estimators = 200 +n_estimators = 100 max_features = "sqrt" test_size = 0.2 n_repeats = 1000 @@ -103,12 +103,12 @@ max_features=max_features, tree_estimator=DecisionTreeClassifier(), random_state=seed, - honest_fraction=0.7, + honest_fraction=0.25, n_jobs=n_jobs, ), random_state=seed, test_size=test_size, - permute_per_tree=True, + permute_per_tree=False, sample_dataset_per_tree=False, ) diff --git a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py index 882f80c3d..c8a5478a4 100644 --- a/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py +++ b/examples/hypothesis_testing/plot_MI_imbalanced_hyppo_testing.py @@ -1,7 +1,7 @@ """ -=============================================================================== -Mutual Information for Gigantic Hypothesis Testing (MIGHT) with Imbalanced Data -=============================================================================== +============================================================================== +Mutual Information for Genuine Hypothesis Testing (MIGHT) with Imbalanced Data +============================================================================== Here, we demonstrate how to do hypothesis testing on highly imbalanced data in terms of their feature-set dimensionalities. @@ -17,7 +17,7 @@ For other examples of hypothesis testing, see the following: -- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_gigantic_hypothesis_testing_forest.py` +- :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_MI_genuine_hypothesis_testing_forest.py` - :ref:`sphx_glr_auto_examples_hypothesis_testing_plot_might_auc.py` For more information on the multi-view decision-tree, see diff --git a/requirements.txt b/requirements.txt index 99963814e..978f90fce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ numpy>=1.25 scipy>=1.11 scikit-learn>=1.3.1 + diff --git a/sktree/stats/forestht.py b/sktree/stats/forestht.py index 4d6dc7b77..56a044c5c 100644 --- a/sktree/stats/forestht.py +++ b/sktree/stats/forestht.py @@ -122,6 +122,7 @@ def __init__( test_size=0.2, permute_per_tree=True, sample_dataset_per_tree=True, + stratify=False, ): self.estimator = estimator self.random_state = random_state @@ -129,6 +130,7 @@ def __init__( self.test_size = test_size self.permute_per_tree = permute_per_tree self.sample_dataset_per_tree = sample_dataset_per_tree + self.stratify = stratify self.n_samples_test_ = None self._n_samples_ = None @@ -160,8 +162,9 @@ def reset(self): self.n_features_in_ = None self._is_fitted = False self._seeds = None + self._y = None - def _get_estimators_indices(self, sample_separate=False): + def _get_estimators_indices(self, stratifier=None, sample_separate=False): indices = np.arange(self._n_samples_, dtype=int) # Get drawn indices along both sample and feature axes @@ -191,7 +194,11 @@ def _get_estimators_indices(self, sample_separate=False): # Operations accessing random_state must be performed identically # to those in `_parallel_build_trees()` indices_train, indices_test = train_test_split( - indices, test_size=self.test_size, shuffle=True, random_state=seed + indices, + test_size=self.test_size, + shuffle=True, + stratify=stratifier, + random_state=seed, ) yield indices_train, indices_test @@ -202,12 +209,13 @@ def _get_estimators_indices(self, sample_separate=False): else: self._seeds = self.estimator_.random_state - # TODO: make random_state consistent indices_train, indices_test = train_test_split( indices, test_size=self.test_size, + stratify=stratifier, random_state=self._seeds, ) + for _ in self.estimator_.estimators_: yield indices_train, indices_test @@ -227,9 +235,12 @@ def train_test_samples_(self): if self._n_samples_ is None: raise RuntimeError("The estimator must be fitted before accessing this attribute.") + # Stratifier uses a cached _y attribute if available + stratifier = self._y if is_classifier(self.estimator_) and self.stratify else None + return [ (indices_train, indices_test) - for indices_train, indices_test in self._get_estimators_indices() + for indices_train, indices_test in self._get_estimators_indices(stratifier=stratifier) ] def _statistic( @@ -329,6 +340,8 @@ def statistic( if self._n_samples_ is None: self._n_samples_, self.n_features_in_ = X.shape + + # Infer type of target y if self._type_of_target_ is None: self._type_of_target_ = type_of_target(y) @@ -339,9 +352,9 @@ def statistic( self.permuted_estimator_ = self._get_estimator() estimator = self.permuted_estimator_ - # Infer type of target y - if not hasattr(self, "_type_of_target"): - self._type_of_target_ = type_of_target(y) + # Store a cache of the y variable + if is_classifier(self._get_estimator()): + self._y = y.copy() # XXX: this can be improved as an extra fit can be avoided, by just doing error-checking # and then setting the internal meta data structures @@ -462,10 +475,10 @@ def test( observe_posteriors = self.observe_posteriors_ observe_stat = self.observe_stat_ - # next permute the data if covariate_index is None: covariate_index = np.arange(X.shape[1], dtype=int) + # next permute the data permute_stat, permute_posteriors, permute_samples = self.statistic( X, y, @@ -724,9 +737,7 @@ def _statistic( self.permute_per_tree, self._type_of_target_, ) - for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(sample_separate=True) - ) + for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) ) else: # fitting a forest will only get one unique train/test split @@ -825,6 +836,9 @@ class FeatureImportanceForestClassifier(BaseForestHT): sample_dataset_per_tree : bool, default=False Whether to sample the dataset per tree or per forest. + stratify : bool, default=True + Whether to stratify the samples by class labels. + Attributes ---------- estimator_ : BaseForest @@ -877,6 +891,7 @@ def __init__( test_size=0.2, permute_per_tree=True, sample_dataset_per_tree=True, + stratify=True, ): super().__init__( estimator=estimator, @@ -885,6 +900,7 @@ def __init__( test_size=test_size, permute_per_tree=permute_per_tree, sample_dataset_per_tree=sample_dataset_per_tree, + stratify=stratify, ) def _get_estimator(self): @@ -945,9 +961,7 @@ def _statistic( self.permute_per_tree, self._type_of_target_, ) - for idx, (indices_train, indices_test) in enumerate( - self._get_estimators_indices(sample_separate=True) - ) + for idx, (indices_train, indices_test) in enumerate(self.train_test_samples_) ) else: # fitting a forest will only get one unique train/test split diff --git a/sktree/stats/tests/test_forestht.py b/sktree/stats/tests/test_forestht.py index cecf34b8c..e71e5e09b 100644 --- a/sktree/stats/tests/test_forestht.py +++ b/sktree/stats/tests/test_forestht.py @@ -69,6 +69,38 @@ def test_featureimportance_forest_permute_pertree(sample_dataset_per_tree): est.statistic(iris_X[:n_samples], iris_y[:n_samples], [0, 1.0], metric="mi") +@pytest.mark.parametrize("sample_dataset_per_tree", [True, False]) +def test_featureimportance_forest_stratified(sample_dataset_per_tree): + est = FeatureImportanceForestClassifier( + estimator=RandomForestClassifier( + n_estimators=10, + random_state=seed, + ), + permute_per_tree=True, + test_size=0.7, + random_state=seed, + sample_dataset_per_tree=sample_dataset_per_tree, + ) + n_samples = 100 + est.statistic(iris_X[:n_samples], iris_y[:n_samples], metric="mi") + + _, indices_test = est.train_test_samples_[0] + y_test = iris_y[indices_test] + + assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( + f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + ) + + est.test(iris_X[:n_samples], iris_y[:n_samples], [0, 1], n_repeats=10, metric="mi") + + _, indices_test = est.train_test_samples_[0] + y_test = iris_y[indices_test] + + assert len(y_test[y_test == 0]) == len(y_test[y_test == 1]), ( + f"{len(y_test[y_test==0])} " f"{len(y_test[y_test==1])}" + ) + + def test_featureimportance_forest_errors(): permute_per_tree = False sample_dataset_per_tree = True