-
-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[DOC] Add Quantile forest examples (#147)
* add quantile prediction examples --------- Co-authored-by: Adam Li <adam2392@gmail.com> Co-authored-by: Sambit Panda <36676569+sampan501@users.noreply.github.com>
- Loading branch information
1 parent
359ea75
commit cedcb1e
Showing
5 changed files
with
505 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
.. _quantile_examples: | ||
|
||
Quantile Predictions with Random Forest | ||
--------------------------------------- | ||
|
||
Examples demonstrating how to generate quantile predictions using Random Forest variants. |
111 changes: 111 additions & 0 deletions
111
examples/quantile_predictions/plot_quantile_interpolation_with_RF.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
""" | ||
======================================================== | ||
Predicting with different quantile interpolation methods | ||
======================================================== | ||
An example comparison of interpolation methods that can be applied during | ||
prediction when the desired quantile lies between two data points. | ||
""" | ||
|
||
from collections import defaultdict | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
from sklearn.ensemble import RandomForestRegressor | ||
|
||
# %% | ||
# Generate the data | ||
# ----------------- | ||
# We use five simple data points to illustrate the difference between the intervals that are | ||
# generated using different interpolation methods. | ||
|
||
X = np.array([[-1, -1], [-1, -1], [-1, -1], [1, 1], [1, 1]]) | ||
y = np.array([-2, -1, 0, 1, 2]) | ||
|
||
# %% | ||
# The interpolation methods | ||
# ------------------------- | ||
# The interpolation methods demonstrated here, which determine the value returned when the | ||
# desired quantile lies between two data points i and j (``i <= j``), are: | ||
# linear, lower, higher, midpoint, and nearest. For more details, see `sktree.RandomForestRegressor`. | ||
# The difference between the methods can be illustrated with the following example: | ||
|
||
interpolations = ["linear", "lower", "higher", "midpoint", "nearest"] | ||
colors = ["#006aff", "#ffd237", "#0d4599", "#f2a619", "#a6e5ff"] | ||
quantiles = [0.025, 0.5, 0.975] | ||
|
||
# Filled in later, one entry per interpolation method.
y_medians = []
y_errs = []

est = RandomForestRegressor(n_estimators=1, random_state=0)
est.fit(X, y)

# Leaf reached by each training sample in each tree; rows are samples,
# columns are trees.
leaf_ids = est.apply(X)

# For every tree, map each leaf id to the indices of the training samples
# that ended up in that leaf.
node_to_indices = []
for tree_leaves in leaf_ids.T:
    leaf_members = defaultdict(list)
    # NOTE: the index is deliberately not called ``id`` -- at script level
    # that would rebind the builtin for the rest of the module.
    for sample_idx, leaf in enumerate(tree_leaves):
        leaf_members[leaf].append(sample_idx)
    node_to_indices.append(leaf_members)

# Drop the "test" data (here: the training data itself) down the trained
# trees and record the leaf each sample lands in.
leaf_ids_test = est.apply(X)

# For each test sample, gather the targets of all training samples sharing
# a leaf with it, concatenated over the trees.
y_pred_quantile = []
for sample_leaves in leaf_ids_test:
    idx = [
        sample_idx
        for leaf_members, leaf in zip(node_to_indices, sample_leaves)
        for sample_idx in leaf_members[leaf]
    ]
    y_pred_quantile.append(y[idx])
|
||
for interpolation in interpolations:
    # Quantile predictions for every sample: one array per requested quantile.
    y_pred = [
        np.array(
            [np.quantile(values, quantile, method=interpolation) for values in y_pred_quantile]
        )
        for quantile in quantiles
    ]
    lower, median, upper = y_pred[0], y_pred[1], y_pred[2]
    y_medians.append(median)
    # Two rows: distance from the median down to the lower quantile, and up
    # to the upper quantile.
    y_errs.append(np.stack((median - lower, upper - median)))
|
||
# One x position per sample; each series is shifted sideways so the markers
# of the different interpolation methods do not overlap.
sc = plt.scatter(np.arange(len(y)) - 0.35, y, color="k", zorder=10)
ebs = [
    plt.errorbar(
        np.arange(len(y)) + (0.15 * (i + 1)) - 0.35,
        median,
        yerr=y_err,
        color=colors[i],
        ecolor=colors[i],
        fmt="o",
    )
    for i, (median, y_err) in enumerate(zip(y_medians, y_errs))
]
plt.xlim([-0.75, len(y) - 0.25])
plt.xticks(np.arange(len(y)), X.tolist())
plt.xlabel("Samples (Feature Values)")
plt.ylabel("Actual and Predicted Values")
plt.legend([sc, *ebs], ["actual", *interpolations], loc=2)
plt.show()
193 changes: 193 additions & 0 deletions
193
examples/quantile_predictions/plot_quantile_regression_intervals_with_RF.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
""" | ||
========================================================== | ||
Quantile prediction intervals with Random Forest Regressor | ||
========================================================== | ||
An example of how to generate quantile prediction intervals with | ||
Random Forest Regressor class on the California Housing dataset. | ||
The plot compares the conditional median with the quantile prediction intervals, i.e. the predictions | ||
at the quantiles 0.025, 0.5 and 0.975. This allows us to generate 95% prediction intervals | ||
with upper and lower bounds. | ||
""" | ||
|
||
from collections import defaultdict | ||
|
||
import matplotlib.pyplot as plt | ||
import numpy as np | ||
from matplotlib.ticker import FuncFormatter | ||
from sklearn import datasets | ||
from sklearn.ensemble import RandomForestRegressor | ||
from sklearn.model_selection import KFold | ||
from sklearn.utils.validation import check_random_state | ||
|
||
# %% | ||
# Quantile Prediction Function | ||
# ---------------------------- | ||
# | ||
# The following function is used to generate quantile predictions using the samples | ||
# that fall into the same leaf node. We collect the corresponding values for each sample and | ||
# use those as the bases for making quantile predictions. | ||
# The function takes the following arguments: | ||
# 1. estimator :class:`~sklearn.ensemble.RandomForestRegressor` estimator or any other variations. | ||
# 2. X_train : training data to be used to train the tree. | ||
# 3. X_test : testing data to be used to predict the quantiles. | ||
# 4. y_train : training labels to be used to train the tree and to make quantile predictions. | ||
# 5. quantiles : list of quantiles to be predicted. | ||
|
||
|
||
# function to calculate the quantile predictions | ||
def get_quantile_prediction(estimator, X_train, X_test, y_train, quantiles=(0.5,)):
    """Fit ``estimator`` and predict quantiles from leaf co-membership.

    For every test sample, the training targets of all training samples that
    share a leaf with it (collected over all trees) form an empirical
    distribution; the requested quantiles of that distribution are returned.

    Parameters
    ----------
    estimator : object
        Unfitted forest-like estimator exposing ``fit(X, y)`` and ``apply(X)``,
        where ``apply`` returns a 2D array of leaf ids with one row per sample
        and one column per tree. It is fitted in place.
    X_train : array-like
        Training data used to fit the estimator.
    X_test : array-like
        Data for which the quantiles are predicted.
    y_train : array-like
        Training targets, used both for fitting and as the pool of values the
        quantiles are computed from.
    quantiles : iterable of float, default=(0.5,)
        Quantiles to predict, each in ``[0, 1]``.

    Returns
    -------
    list of list
        ``result[q][i]`` is the prediction for test sample ``i`` at
        ``quantiles[q]``.
    """
    estimator.fit(X_train, y_train)
    # Positional fancy indexing below needs an array, not a plain sequence.
    y_values = np.asarray(y_train)

    # Leaf reached by each training sample in each tree.
    leaf_ids = estimator.apply(X_train)

    # For every tree, map each leaf id to the indices of the training samples
    # that fell into it.
    node_to_indices = []
    for tree_leaves in leaf_ids.T:
        leaf_members = defaultdict(list)
        for sample_idx, leaf in enumerate(tree_leaves):
            leaf_members[leaf].append(sample_idx)
        node_to_indices.append(leaf_members)

    # Drop the test data down the trained trees and record the leaves reached.
    leaf_ids_test = estimator.apply(X_test)

    # For each test sample, pool the training targets that share a leaf with
    # it, concatenated over the trees.
    y_pred_quantile = []
    for sample_leaves in leaf_ids_test:
        idx = [
            sample_idx
            for leaf_members, leaf in zip(node_to_indices, sample_leaves)
            for sample_idx in leaf_members[leaf]
        ]
        y_pred_quantile.append(y_values[idx])

    # One list of per-sample predictions for each requested quantile.
    return [[np.quantile(values, quantile) for values in y_pred_quantile] for quantile in quantiles]
|
||
|
||
# Seeded random state used to permute the samples below.
rng = check_random_state(0)

# Tick formatter: whole dollars with thousands separators, e.g. "$250,000".
dollar_formatter = FuncFormatter(lambda x, _pos: f"${int(x):,}")
|
||
# %% | ||
# Load the California Housing Prices dataset. | ||
|
||
california = datasets.fetch_california_housing()
# Use at most 1,000 samples. ``perm`` is a permutation of ``range(n_samples)``,
# so this selects the first ``n_samples`` rows in shuffled order.
n_samples = min(california.target.size, 1000)
perm = rng.permutation(n_samples)
X, y = california.data[perm], california.target[perm]

rf = RandomForestRegressor(n_estimators=100, random_state=0)

# The former bare ``kf.get_n_splits(X)`` call only returned the number of
# splits and discarded it, so it has been removed.
kf = KFold(n_splits=5)

# Out-of-fold results, accumulated over the cross-validation folds.
y_true = []
y_pred = []
y_pred_lower = []
y_pred_upper = []
|
||
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Consider one third of the features at each split.
    rf.set_params(max_features=X_train.shape[1] // 3)

    # Lower bound, median and upper bound of the 95% prediction interval.
    y_pred_i = get_quantile_prediction(
        rf, X_train, X_test, y_train, quantiles=[0.025, 0.5, 0.975]
    )
    fold_lower, fold_median, fold_upper = y_pred_i

    # Append this fold's held-out results to the running totals.
    y_true = np.concatenate((y_true, y_test))
    y_pred = np.concatenate((y_pred, fold_median))
    y_pred_lower = np.concatenate((y_pred_lower, fold_lower))
    y_pred_upper = np.concatenate((y_pred_upper, fold_upper))

# Scale data to dollars (in place, so the module-level arrays are updated).
for values in (y_true, y_pred, y_pred_lower, y_pred_upper):
    values *= 1e5
|
||
# %% | ||
# Plot the results | ||
# ---------------- | ||
# Plot the conditional median and prediction intervals. | ||
# The left plot shows the predicted values (conditional median) with the confidence intervals at 95% | ||
# against the observed values. The upper and lower bounds are indicated with the blue line segments. | ||
# The right plot shows the same predictions sorted by the width of the prediction interval and centered at the | ||
# halfway point between the upper and lower bounds. This allows us to see the distribution of the | ||
# confidence intervals, which increases as the variance of the predicted value increases. | ||
|
||
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Order everything by the predicted median so the left panel reads left to right.
sort_idx = np.argsort(y_pred)
y_true = y_true[sort_idx]
y_pred = y_pred[sort_idx]
y_pred_lower = y_pred_lower[sort_idx]
y_pred_upper = y_pred_upper[sort_idx]

# Shared axis limits: the range of observed and predicted values, padded by
# roughly $10,000 on each side and rounded to a multiple of $10,000.
# The upper limit must be padded upwards (``+ 1``); subtracting there placed
# the limit below the data maximum and clipped the highest points.
y_min = min(np.minimum(y_true, y_pred))
y_max = max(np.maximum(y_true, y_pred))
y_min = float(np.round((y_min / 10000) - 1, 0) * 10000)
y_max = float(np.round((y_max / 10000) + 1, 0) * 10000)

# One vertical segment per sample spanning its prediction interval.
for low, mid, upp in zip(y_pred_lower, y_pred, y_pred_upper):
    ax1.plot([mid, mid], [low, upp], lw=4, c="#e0f2ff")

# Observed values, then the interval bounds, then the identity line.
ax1.plot(y_pred, y_true, c="#f2a619", lw=0, marker=".", ms=5)
ax1.plot(y_pred, y_pred_lower, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot(y_pred, y_pred_upper, alpha=0.4, c="#006AFF", lw=0, marker="_", ms=4)
ax1.plot([y_min, y_max], [y_min, y_max], ls="--", lw=1, c="grey")
ax1.grid(axis="x", color="0.95")
ax1.grid(axis="y", color="0.95")
ax1.xaxis.set_major_formatter(dollar_formatter)
ax1.yaxis.set_major_formatter(dollar_formatter)
ax1.set_xlim(y_min, y_max)
ax1.set_ylim(y_min, y_max)
ax1.set_xlabel("Fitted Values (Conditional Median)")
ax1.set_ylabel("Observed Values")
|
||
# Re-order the samples by the width of their prediction interval.
y_pred_interval = y_pred_upper - y_pred_lower
sort_idx = np.argsort(y_pred_interval)
y_true, y_pred_lower, y_pred_upper = (
    y_true[sort_idx],
    y_pred_lower[sort_idx],
    y_pred_upper[sort_idx],
)

# Center data, with the mean of the prediction interval at 0.
mean = (y_pred_lower + y_pred_upper) / 2
for values in (y_true, y_pred_lower, y_pred_upper):
    values -= mean

ax2.plot(y_true, c="#f2a619", lw=0, marker=".", ms=5)
# Shaded band between the bounds, with each bound outlined on top of it.
ax2.fill_between(
    np.arange(len(y_pred_upper)),
    y_pred_lower,
    y_pred_upper,
    alpha=0.8,
    color="#e0f2ff",
)
for bound in (y_pred_lower, y_pred_upper):
    ax2.plot(np.arange(n_samples), bound, alpha=0.8, c="#006aff", lw=2)
ax2.grid(axis="x", color="0.95")
ax2.grid(axis="y", color="0.95")
ax2.yaxis.set_major_formatter(dollar_formatter)
ax2.set_xlim([0, n_samples])
ax2.set_xlabel("Ordered Samples")
ax2.set_ylabel("Observed Values and Prediction Intervals")

plt.subplots_adjust(top=0.15)
fig.tight_layout(pad=3)

plt.show()
Oops, something went wrong.