From c206463490b1f7c140379180c1b1526caac72dd7 Mon Sep 17 00:00:00 2001
From: Adam Li
Date: Thu, 12 Oct 2023 15:53:34 -0400
Subject: [PATCH] Organize examples

Signed-off-by: Adam Li
---
 examples/calibration/README.txt               |   6 +
 .../plot_overlapping_gaussians.py             |   0
 ...gigantic_hypothesis_testing_forest copy.py | 138 ------------------
 examples/plot_multiview_dtc.py                | 127 ----------------
 examples/sklearn_vs_sktree/README.txt         |   6 +
 .../{ => sklearn_vs_sktree}/plot_iris_dtc.py  |   0
 examples/sparse_oblique_trees/README.txt      |   6 +
 .../plot_extra_oblique_random_forest.py       |   0
 .../plot_extra_orf_sample_size.py             |   0
 ...ique_axis_aligned_forests_sparse_parity.py |   0
 .../plot_oblique_forests_iris.py              |   0
 .../plot_oblique_random_forest.py             |   0
 12 files changed, 18 insertions(+), 265 deletions(-)
 create mode 100644 examples/calibration/README.txt
 rename examples/{ => calibration}/plot_overlapping_gaussians.py (100%)
 delete mode 100644 examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py
 delete mode 100644 examples/plot_multiview_dtc.py
 create mode 100644 examples/sklearn_vs_sktree/README.txt
 rename examples/{ => sklearn_vs_sktree}/plot_iris_dtc.py (100%)
 create mode 100644 examples/sparse_oblique_trees/README.txt
 rename examples/{ => sparse_oblique_trees}/plot_extra_oblique_random_forest.py (100%)
 rename examples/{ => sparse_oblique_trees}/plot_extra_orf_sample_size.py (100%)
 rename examples/{ => sparse_oblique_trees}/plot_oblique_axis_aligned_forests_sparse_parity.py (100%)
 rename examples/{ => sparse_oblique_trees}/plot_oblique_forests_iris.py (100%)
 rename examples/{ => sparse_oblique_trees}/plot_oblique_random_forest.py (100%)

diff --git a/examples/calibration/README.txt b/examples/calibration/README.txt
new file mode 100644
index 000000000..84d5f5758
--- /dev/null
+++ b/examples/calibration/README.txt
@@ -0,0 +1,6 @@
+.. _calibration_examples:
+
+Calibrated decision trees via honesty
+-------------------------------------
+
+Examples demonstrating the usage of honest decision trees to obtain calibrated predictions.
diff --git a/examples/plot_overlapping_gaussians.py b/examples/calibration/plot_overlapping_gaussians.py
similarity index 100%
rename from examples/plot_overlapping_gaussians.py
rename to examples/calibration/plot_overlapping_gaussians.py
diff --git a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py b/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py
deleted file mode 100644
index 423bc63dc..000000000
--- a/examples/hypothesis_testing/plot_MI_gigantic_hypothesis_testing_forest copy.py
+++ /dev/null
@@ -1,138 +0,0 @@
-"""
-===========================================================
-Mutual Information for Gigantic Hypothesis Testing (MIGHT)
-===========================================================
-
-An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric
-multivariate hypothesis test, on simulated datasets. Here, we present a simulation
-of how MIGHT is used to test the hypothesis that a "feature set is important for
-predicting the target". This is a generalization of the framework presented in
-:footcite:`coleman2022scalable`.
-
-We simulate a dataset with 1000 features, 500 samples, and a binary class target
-variable. Within each feature set, there is 500 features associated with one feature
-set, and another 500 features associated with another feature set. One could think of
-these for example as different datasets collected on the same patient in a biomedical setting.
-The first feature set (X) is strongly correlated with the target, and the second
-feature set (W) is weakly correlated with the target (y). Here, we are testing the
-null hypothesis:
-
-- ``H0: I(X; y) - I(X, W; y) = 0``
-- ``HA: I(X; y) - I(X, W; y) < 0`` indicating that there is more mutual information with
-  respect to ``y``
-
-where ``I`` is mutual information. For example, this could be true in the following settings,
-where X is our informative feature set and W is our uninformative feature set.
-
-- ``W X -> y``: here ``W`` is completely disconnected from X and y.
-- ``W -> X -> y``: here ``W`` is d-separated from y given X.
-- ``W <- X -> y``: here ``W`` is d-separated from y given X.
-
-We then use MIGHT to test the hypothesis that the first feature set is important for
-predicting the target, and the second feature set is not important for predicting the
-target. We use :class:`~sktree.stats.FeatureImportanceForestClassifier`.
-"""
-
-import numpy as np
-from scipy.special import expit
-
-from sktree import HonestForestClassifier
-from sktree.stats import FeatureImportanceForestClassifier
-from sktree.tree import DecisionTreeClassifier
-
-seed = 12345
-rng = np.random.default_rng(seed)
-
-# %%
-# Simulate data
-# -------------
-# We simulate the two feature sets, and the target variable. We then combine them
-# into a single dataset to perform hypothesis testing.
-
-n_samples = 1000
-n_features_set = 500
-mean = 1.0
-sigma = 2.0
-beta = 5.0
-
-unimportant_mean = 0.0
-unimportant_sigma = 4.5
-
-# first sample the informative features, and then the uniformative features
-X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))
-X_important = np.hstack(
-    [
-        X_important,
-        rng.normal(
-            loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)
-        ),
-    ]
-)
-
-X_unimportant = rng.normal(
-    loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)
-)
-X = np.hstack([X_important, X_unimportant])
-
-# simulate the binary target variable
-y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)
-
-# %%
-# Perform hypothesis testing using Mutual Information
-# ---------------------------------------------------
-# Here, we use :class:`~sktree.stats.FeatureImportanceForestClassifier` to perform the hypothesis
-# test. The test statistic is computed by comparing the metric (i.e. mutual information) estimated
-# between two forests. One forest is trained on the original dataset, and one forest is trained
-# on a permuted dataset, where the rows of the ``covariate_index`` columns are shuffled randomly.
-#
-# The null distribution is then estimated in an efficient manner using the framework of
-# :footcite:`coleman2022scalable`. The sample evaluations of each forest (i.e. the posteriors)
-# are sampled randomly ``n_repeats`` times to generate a null distribution. The pvalue is then
-# computed as the proportion of samples in the null distribution that are less than the
-# observed test statistic.
-
-n_estimators = 200
-max_features = "sqrt"
-test_size = 0.2
-n_repeats = 1000
-n_jobs = -1
-
-est = FeatureImportanceForestClassifier(
-    estimator=HonestForestClassifier(
-        n_estimators=n_estimators,
-        max_features=max_features,
-        tree_estimator=DecisionTreeClassifier(),
-        random_state=seed,
-        honest_fraction=0.7,
-        n_jobs=n_jobs,
-    ),
-    random_state=seed,
-    test_size=test_size,
-    permute_per_tree=True,
-    sample_dataset_per_tree=False,
-)
-
-print(
-    f"Permutation per tree: {est.permute_per_tree} and sampling dataset per tree: "
-    f"{est.sample_dataset_per_tree}"
-)
-# we test for the first feature set, which is important and thus should return a pvalue < 0.05
-stat, pvalue = est.test(
-    X, y, covariate_index=np.arange(n_features_set, dtype=int), metric="mi", n_repeats=n_repeats
-)
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
-
-# we test for the second feature set, which is unimportant and thus should return a pvalue > 0.05
-stat, pvalue = est.test(
-    X,
-    y,
-    covariate_index=np.arange(n_features_set, dtype=int) + n_features_set,
-    metric="mi",
-    n_repeats=n_repeats,
-)
-print(f"Estimated MI difference: {stat} with Pvalue: {pvalue}")
-
-# %%
-# References
-# ----------
-# .. footbibliography::
diff --git a/examples/plot_multiview_dtc.py b/examples/plot_multiview_dtc.py
deleted file mode 100644
index bd67940db..000000000
--- a/examples/plot_multiview_dtc.py
+++ /dev/null
@@ -1,127 +0,0 @@
-"""
-============================================================
-Analyze a multi-view dataset with a multi-view random forest
-============================================================
-
-An example using :class:`~sktree.stats.FeatureImportanceForestClassifier` for nonparametric
-multivariate hypothesis test, on simulated datasets. Here, we present a simulation
-of how MIGHT is used to evaluate how a "feature set is important for predicting the target".
-
-We simulate a dataset with 1000 features, 500 samples, and a binary class target
-variable. Within each feature set, there is 500 features associated with one feature
-set, and another 500 features associated with another feature set. One could think of
-these for example as different datasets collected on the same patient in a biomedical setting.
-The first feature set (X) is strongly correlated with the target, and the second
-feature set (W) is weakly correlated with the target (y).
-
-We then use MIGHT to calculate the partial AUC of these sets.
-"""
-
-import numpy as np
-from scipy.special import expit
-
-from sktree import HonestForestClassifier
-from sktree.stats import FeatureImportanceForestClassifier
-from sktree.tree import DecisionTreeClassifier
-
-seed = 12345
-rng = np.random.default_rng(seed)
-
-# %%
-# Simulate data
-# -------------
-# We simulate the two feature sets, and the target variable. We then combine them
-# into a single dataset to perform hypothesis testing.
-
-n_samples = 1000
-n_features_set = 500
-mean = 1.0
-sigma = 2.0
-beta = 5.0
-
-unimportant_mean = 0.0
-unimportant_sigma = 4.5
-
-# first sample the informative features, and then the uniformative features
-X_important = rng.normal(loc=mean, scale=sigma, size=(n_samples, 10))
-X_important = np.hstack(
-    [
-        X_important,
-        rng.normal(
-            loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set - 10)
-        ),
-    ]
-)
-
-X_unimportant = rng.normal(
-    loc=unimportant_mean, scale=unimportant_sigma, size=(n_samples, n_features_set)
-)
-
-# simulate the binary target variable
-y = rng.binomial(n=1, p=expit(beta * X_important[:, :10].sum(axis=1)), size=n_samples)
-
-# %%
-# Use partial AUC as test statistic
-# ---------------------------------
-# You can specify the maximum specificity by modifying ``max_fpr`` in ``statistic``.
-
-n_estimators = 125
-max_features = "sqrt"
-metric = "auc"
-test_size = 0.2
-n_jobs = -1
-honest_fraction = 0.7
-max_fpr = 0.1
-
-est = FeatureImportanceForestClassifier(
-    estimator=HonestForestClassifier(
-        n_estimators=n_estimators,
-        max_features=max_features,
-        tree_estimator=DecisionTreeClassifier(),
-        random_state=seed,
-        honest_fraction=honest_fraction,
-        n_jobs=n_jobs,
-    ),
-    random_state=seed,
-    test_size=test_size,
-    permute_per_tree=True,
-    sample_dataset_per_tree=True,
-)
-
-# we test for the first feature set, which is important and thus should return a higher AUC
-stat, posterior_arr, samples = est.statistic(
-    X_important,
-    y,
-    metric=metric,
-    return_posteriors=True,
-)
-
-print(f"ASH-90 / Partial AUC: {stat}")
-print(f"Shape of Observed Samples: {samples.shape}")
-print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}")
-
-# %%
-# Repeat for the second feature set
-# ---------------------------------
-# This feature set has a smaller statistic, which is expected due to its weak correlation.
-
-stat, posterior_arr, samples = est.statistic(
-    X_unimportant,
-    y,
-    metric=metric,
-    return_posteriors=True,
-)
-
-print(f"ASH-90 / Partial AUC: {stat}")
-print(f"Shape of Observed Samples: {samples.shape}")
-print(f"Shape of Tree Posteriors for the positive class: {posterior_arr.shape}")
-
-# %%
-# All posteriors are saved within the model
-# -----------------------------------------
-# Extract the results from the model variables anytime. You can save the model with ``pickle``.
-#
-# ASH-90 / Partial AUC: ``est.observe_stat_``
-# Observed Samples: ``est.observe_samples_``
-# Tree Posteriors for the positive class: ``est.observe_posteriors_`` (n_trees, n_samples_test, 1)
-# True Labels: ``est.y_true_final_``
diff --git a/examples/sklearn_vs_sktree/README.txt b/examples/sklearn_vs_sktree/README.txt
new file mode 100644
index 000000000..d942d71f8
--- /dev/null
+++ b/examples/sklearn_vs_sktree/README.txt
@@ -0,0 +1,6 @@
+.. _sklearn_examples:
+
+Comparing sklearn and sktree decision trees
+-------------------------------------------
+
+Examples demonstrating the difference between sklearn and sktree decision trees.
diff --git a/examples/plot_iris_dtc.py b/examples/sklearn_vs_sktree/plot_iris_dtc.py
similarity index 100%
rename from examples/plot_iris_dtc.py
rename to examples/sklearn_vs_sktree/plot_iris_dtc.py
diff --git a/examples/sparse_oblique_trees/README.txt b/examples/sparse_oblique_trees/README.txt
new file mode 100644
index 000000000..61c596af1
--- /dev/null
+++ b/examples/sparse_oblique_trees/README.txt
@@ -0,0 +1,6 @@
+.. _sporf_examples:
+
+Sparse oblique projections with oblique decision-trees
+------------------------------------------------------
+
+Examples demonstrating learning using oblique random forests.
diff --git a/examples/plot_extra_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py
similarity index 100%
rename from examples/plot_extra_oblique_random_forest.py
rename to examples/sparse_oblique_trees/plot_extra_oblique_random_forest.py
diff --git a/examples/plot_extra_orf_sample_size.py b/examples/sparse_oblique_trees/plot_extra_orf_sample_size.py
similarity index 100%
rename from examples/plot_extra_orf_sample_size.py
rename to examples/sparse_oblique_trees/plot_extra_orf_sample_size.py
diff --git a/examples/plot_oblique_axis_aligned_forests_sparse_parity.py b/examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py
similarity index 100%
rename from examples/plot_oblique_axis_aligned_forests_sparse_parity.py
rename to examples/sparse_oblique_trees/plot_oblique_axis_aligned_forests_sparse_parity.py
diff --git a/examples/plot_oblique_forests_iris.py b/examples/sparse_oblique_trees/plot_oblique_forests_iris.py
similarity index 100%
rename from examples/plot_oblique_forests_iris.py
rename to examples/sparse_oblique_trees/plot_oblique_forests_iris.py
diff --git a/examples/plot_oblique_random_forest.py b/examples/sparse_oblique_trees/plot_oblique_random_forest.py
similarity index 100%
rename from examples/plot_oblique_random_forest.py
rename to examples/sparse_oblique_trees/plot_oblique_random_forest.py