docs updates [skip ci] (1664)

neurodata · Oct 24, 2023 · 937b6f5 · 937b6f5
1 parent d2dbf8a
commit 937b6f5
Show file tree

Hide file tree

Showing 139 changed files with 5,354 additions and 250 deletions.
diff --git a/dev/.buildinfo b/dev/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 9656cdf230e2bbbdfe34908ac8c0eff0
+config: 3dbf46ee416248617a9be01c51b6f5e5
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/dev/_downloads/04ec4681296d6b5e485ace6864fed6d5/plot_quantile_toy_example_with_RF.ipynb b/dev/_downloads/04ec4681296d6b5e485ace6864fed6d5/plot_quantile_toy_example_with_RF.ipynb
@@ -0,0 +1,97 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Quantile prediction with Random Forest Regressor class\n\nAn example that demonstrates how to use the Random Forest to generate\nquantile predictions such as conditional median and prediction intervals.\nThe example compares the predictions to a ground truth function used\nto generate noisy samples.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from collections import defaultdict\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.model_selection import train_test_split"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate the data\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "def make_toy_dataset(n_samples, seed=0):\n    rng = np.random.RandomState(seed)\n\n    x = rng.uniform(0, 10, size=n_samples)\n    f = x * np.sin(x)\n\n    sigma = 0.25 + x / 10\n    noise = rng.lognormal(sigma=sigma) - np.exp(sigma**2 / 2)\n    y = f + noise\n\n    return np.atleast_2d(x).T, y\n\n\nn_samples = 1000\nX, y = make_toy_dataset(n_samples)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)\n\nxx = np.atleast_2d(np.linspace(0, 10, n_samples)).T"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Fit the model to the training samples\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "rf = RandomForestRegressor(max_depth=3, random_state=0)\nrf.fit(X_train, y_train)\n\ny_pred = rf.predict(xx)\n\n# get the leaf nodes that each sample fell into\nleaf_ids = rf.apply(X_train)\n# create a list of dictionaries that map each node to the samples that fell into it,\n# one dictionary for each tree\nnode_to_indices = []\nfor tree in range(leaf_ids.shape[1]):\n    d = defaultdict(list)\n    for id, leaf in enumerate(leaf_ids[:, tree]):\n        d[leaf].append(id)\n    node_to_indices.append(d)\n# drop the evaluation grid xx down the trained trees and\n# get the indices of the leaf nodes it falls into\nleaf_ids_test = rf.apply(xx)\n# for each sample, collect the indices of the training samples that fell into\n# the same leaf node for each tree\ny_pred_quatile = []\nfor sample in range(leaf_ids_test.shape[0]):\n    li = [\n        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])\n    ]\n    # merge the list of indices into one\n    idx = [item for sublist in li for item in sublist]\n    # get the y_train for each corresponding id\n    y_pred_quatile.append(y_train[idx])\n# get the quantile predictions for each predicted sample\ny_pred_low = [np.quantile(y_pred_quatile[i], 0.025) for i in range(len(y_pred_quatile))]\ny_pred_med = [np.quantile(y_pred_quatile[i], 0.5) for i in range(len(y_pred_quatile))]\ny_pred_upp = [np.quantile(y_pred_quatile[i], 0.975) for i in range(len(y_pred_quatile))]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plot the results\nPlot the conditional median and prediction intervals.\nThe blue line is the predicted median and the shaded area indicates the 95% prediction interval.\nThe dots are the held-out test observations and the black line indicates the function that\nis used to generate the samples.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "plt.plot(X_test, y_test, \".\", c=\"#f2a619\", label=\"Test Observations\", ms=5)\nplt.plot(xx, (xx * np.sin(xx)), c=\"black\", label=\"$f(x) = x\\,\\sin(x)$\", lw=2)\nplt.plot(xx, y_pred_med, c=\"#006aff\", label=\"Predicted Median\", lw=3, ms=5)\nplt.fill_between(\n    xx.ravel(),\n    y_pred_low,\n    y_pred_upp,\n    color=\"#e0f2ff\",\n    label=\"Predicted 95% Interval\",\n)\nplt.xlabel(\"$x$\")\nplt.ylabel(\"$f(x)$\")\nplt.legend(loc=\"upper left\")\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.18"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip b/dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
diff --git a/..._downloads/0b59a1e975638657f27915950026fd6a/plot_quantile_regression_intervals_with_RF.py b/..._downloads/0b59a1e975638657f27915950026fd6a/plot_quantile_regression_intervals_with_RF.py
@@ -0,0 +1,193 @@
+"""
+==========================================================
+Quantile prediction intervals with Random Forest Regressor
+==========================================================
+
+An example of how to generate quantile prediction intervals with the
+Random Forest Regressor class on the California Housing dataset.
+The plot compares the conditional median with the quantile prediction intervals, i.e. the
+predictions at the 0.025, 0.5 and 0.975 quantiles. This gives a 95% prediction interval with
+lower and upper bounds around the predicted median.
+
+"""
+
+from collections import defaultdict
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.ticker import FuncFormatter
+from sklearn import datasets
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import KFold
+from sklearn.utils.validation import check_random_state
+
+# %%
+# Quantile Prediction Function
+# ----------------------------
+#
+# The following function generates quantile predictions from the training samples
+# that fall into the same leaf nodes as each test sample. We collect the corresponding target
+# values for each test sample and use those as the basis for making quantile predictions.
+# The function takes the following arguments:
+#
+# 1. estimator : a :class:`~sklearn.ensemble.RandomForestRegressor` estimator or any variant of it.
+# 2. X_train : training data to be used to train the forest.
+# 3. X_test : testing data for which the quantiles are predicted.
+# 4. y_train : training labels to be used to train the forest and to make quantile predictions.
+# 5. quantiles : list of quantiles to be predicted.
+
+
+# function to calculate the quantile predictions
+def get_quantile_prediction(estimator, X_train, X_test, y_train, quantiles=[0.5]):
+    estimator.fit(X_train, y_train)
+    # get the leaf nodes that each sample fell into
+    leaf_ids = estimator.apply(X_train)
+    # create a list of dictionary that maps node to samples that fell into it
+    # for each tree
+    node_to_indices = []
+    for tree in range(leaf_ids.shape[1]):
+        d = defaultdict(list)
+        for id, leaf in enumerate(leaf_ids[:, tree]):
+            d[leaf].append(id)
+        node_to_indices.append(d)
+    # drop the X_test to the trained tree and
+    # get the indices of leaf nodes that fall into it
+    leaf_ids_test = estimator.apply(X_test)
+    # for each samples, collect the indices of the samples that fell into
+    # the same leaf node for each tree
+    y_pred_quantile = []
+    for sample in range(leaf_ids_test.shape[0]):
+        li = [
+            node_to_indices[tree][leaf_ids_test[sample][tree]]
+            for tree in range(leaf_ids_test.shape[1])
+        ]
+        # merge the list of indices into one
+        idx = [item for sublist in li for item in sublist]
+        # get the y_train for each corresponding id``
+        y_pred_quantile.append(y_train[idx])
+    # get the quatile preditions for each predicted sample
+    y_preds = [
+        [np.quantile(y_pred_quantile[i], quantile) for i in range(len(y_pred_quantile))]
+        for quantile in quantiles
+    ]
+    return y_preds
+
+
+rng = check_random_state(0)
+
+dollar_formatter = FuncFormatter(lambda x, p: "$" + format(int(x), ","))
+
+# %%
+# Load the California Housing Prices dataset.
+
+california = datasets.fetch_california_housing()
+n_samples = min(california.target.size, 1000)
+perm = rng.permutation(n_samples)
+X = california.data[perm]
+y = california.target[perm]
+
+rf = RandomForestRegressor(n_estimators=100, random_state=0)
+
+kf = KFold(n_splits=5)
+kf.get_n_splits(X)
+
+y_true = []
+y_pred = []
+y_pred_lower = []
+y_pred_upper = []
+
+for train_index, test_index in kf.split(X):
+    X_train, X_test, y_train, y_test = (
+        X[train_index],
+        X[test_index],
+        y[train_index],
+        y[test_index],
+    )
+
+    rf.set_params(max_features=X_train.shape[1] // 3)
+
+    # Get predictions at 95% prediction intervals and median.
+    y_pred_i = get_quantile_prediction(rf, X_train, X_test, y_train, quantiles=[0.025, 0.5, 0.975])
+
+    y_true = np.concatenate((y_true, y_test))
+    y_pred = np.concatenate((y_pred, y_pred_i[1]))
+    y_pred_lower = np.concatenate((y_pred_lower, y_pred_i[0]))
+    y_pred_upper = np.concatenate((y_pred_upper, y_pred_i[2]))
+
+# Scale data to dollars.
+y_true *= 1e5
+y_pred *= 1e5
+y_pred_lower *= 1e5
+y_pred_upper *= 1e5
+
+# %%
+# Plot the results
+# ----------------
+# Plot the conditional median and prediction intervals.
+# The left plot shows the predicted values (conditional median) with their 95% prediction intervals
+# against the observed values of the held-out folds. The upper and lower bounds are indicated with
+# the blue line segments.
+# The right plot shows the same predictions sorted by the width of the prediction interval and
+# centered at the halfway point between the upper and lower bounds. This allows us to see the
+# distribution of the prediction intervals, which widen as the variance of the predicted value
+# increases.
+
+fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
+
+y_pred_interval = y_pred_upper - y_pred_lower
+sort_idx = np.argsort(y_pred)
+y_true = y_true[sort_idx]
+y_pred = y_pred[sort_idx]
+y_pred_lower = y_pred_lower[sort_idx]
+y_pred_upper = y_pred_upper[sort_idx]
+y_min = min(np.minimum(y_true, y_pred))
+y_max = max(np.maximum(y_true, y_pred))
+y_min = float(np.round((y_min / 10000) - 1, 0) * 10000)
+y_max = float(np.round((y_max / 10000) - 1, 0) * 10000)
+
+for low, mid, upp in zip(y_pred_lower, y_pred, y_pred_upper):
+    ax1.plot([mid, mid], [low, upp], lw=4, c="#e0f2ff")
+
+ax1.plot(y_pred, y_true, c="#f2a619", lw=0, marker=".", ms=5)
+ax1.plot(y_pred, y_pred_lower, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
+ax1.plot(y_pred, y_pred_upper, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
+ax1.plot([y_min, y_max], [y_min, y_max], ls="--", lw=1, c="grey")
+ax1.grid(axis="x", color="0.95")
+ax1.grid(axis="y", color="0.95")
+ax1.xaxis.set_major_formatter(dollar_formatter)
+ax1.yaxis.set_major_formatter(dollar_formatter)
+ax1.set_xlim(y_min, y_max)
+ax1.set_ylim(y_min, y_max)
+ax1.set_xlabel("Fitted Values (Conditional Median)")
+ax1.set_ylabel("Observed Values")
+
+y_pred_interval = y_pred_upper - y_pred_lower
+sort_idx = np.argsort(y_pred_interval)
+y_true = y_true[sort_idx]
+y_pred_lower = y_pred_lower[sort_idx]
+y_pred_upper = y_pred_upper[sort_idx]
+
+# Center data, with the mean of the prediction interval at 0.
+mean = (y_pred_lower + y_pred_upper) / 2
+y_true -= mean
+y_pred_lower -= mean
+y_pred_upper -= mean
+
+ax2.plot(y_true, c="#f2a619", lw=0, marker=".", ms=5)
+ax2.fill_between(
+    np.arange(len(y_pred_upper)),
+    y_pred_lower,
+    y_pred_upper,
+    alpha=0.8,
+    color="#e0f2ff",
+)
+ax2.plot(np.arange(n_samples), y_pred_lower, alpha=0.8, c="#006aff", lw=2)
+ax2.plot(np.arange(n_samples), y_pred_upper, alpha=0.8, c="#006aff", lw=2)
+ax2.grid(axis="x", color="0.95")
+ax2.grid(axis="y", color="0.95")
+ax2.yaxis.set_major_formatter(dollar_formatter)
+ax2.set_xlim([0, n_samples])
+ax2.set_xlabel("Ordered Samples")
+ax2.set_ylabel("Observed Values and Prediction Intervals")
+
+plt.subplots_adjust(top=0.15)
+fig.tight_layout(pad=3)
+
+plt.show()
diff --git a/...downloads/1bbf8dd9c72a23d179c76280f952170a/plot_quantile_vs_standard_oblique_forest.ipynb b/...downloads/1bbf8dd9c72a23d179c76280f952170a/plot_quantile_vs_standard_oblique_forest.ipynb
@@ -0,0 +1,97 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Quantile regression vs. standard and oblique regression forest\n\nAn example to generate quantile predictions using an oblique random forest\ninstance on a synthetic, right-skewed dataset.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from collections import defaultdict\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport scipy as sp\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils.validation import check_random_state\n\nfrom sktree.ensemble import ObliqueRandomForestRegressor\n\nrng = check_random_state(0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate the data\nWe use a synthetic dataset with 2 features and 5000 samples. The target is\ngenerated from a skewed normal distribution. (The mean of the distribution\nis to the right of the median.)\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "n_samples = 5000\na, loc, scale = 5, -1, 1\nskewnorm_rv = sp.stats.skewnorm(a, loc, scale)\nskewnorm_rv.random_state = rng\ny = skewnorm_rv.rvs(n_samples)\nX = rng.randn(n_samples, 2) * y.reshape(-1, 1)\n\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)\n\nregr_orf = ObliqueRandomForestRegressor(n_estimators=10, random_state=0)\n\nregr_orf.fit(X_train, y_train)\n\ny_pred_orf = regr_orf.predict(X_test)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Generate Quantile Predictions\nFor each prediction, the training samples that fell into the same leaf nodes as the test sample\nare collected and then used to compute the quantile statistics for that prediction.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Get the leaf-nodes the training samples fall into\nleaf_ids = regr_orf.apply(X_train)\n# create a list of dictionaries that map each node to the samples that fell into it,\n# one dictionary for each tree\nnode_to_indices = []\nfor tree in range(leaf_ids.shape[1]):\n    d = defaultdict(list)\n    for id, leaf in enumerate(leaf_ids[:, tree]):\n        d[leaf].append(id)\n    node_to_indices.append(d)\n# drop X_test down the trained trees and\n# get the indices of the leaf nodes it falls into\nleaf_ids_test = regr_orf.apply(X_test)\n# for each sample, collect the indices of the training samples that fell into\n# the same leaf node for each tree\ny_pred_quantile = []\nfor sample in range(leaf_ids_test.shape[0]):\n    li = [\n        node_to_indices[tree][leaf_ids_test[sample][tree]] for tree in range(leaf_ids_test.shape[1])\n    ]\n    # merge the list of indices into one\n    idx = [item for sublist in li for item in sublist]\n    # get the y_train for each corresponding id\n    y_pred_quantile.append(y_train[idx])\n# get the quantile predictions for each predicted sample\ny_pred_quantile = [np.quantile(y_pred_quantile[i], 0.5) for i in range(len(y_pred_quantile))]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plot the results\nThe plot shows the distribution of the actual target values and the predicted median\n(i.e. 0.5 quantile), and the mean prediction by the oblique random forest regressor.\nIn this skewed dataset, the median prediction using the quantile method works better at\npredicting the off-centered target distribution than the regular mean prediction.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "colors = [\"#c0c0c0\", \"#a6e5ff\", \"#e7a4f5\"]\nnames = [\"Actual\", \"QRF (Median)\", \"ORF (Mean)\"]\nplt.hist([y_test, y_pred_quantile, y_pred_orf], bins=50, color=colors, label=names)\nplt.xlabel(\"Actual and Predicted Target Values\")\nplt.ylabel(\"Counts\")\nplt.legend()\nplt.show()"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.18"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}