From d7a114413d1f11bf5f7029cd519c9a29a66b1890 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen <lorentzen.ch@gmail.com>
Date: Wed, 11 Oct 2023 20:44:26 +0200
Subject: [PATCH] DOC improve example on cyclic feature engineering (#27501)

---
 .../plot_cyclical_feature_engineering.py      | 57 ++++++++++++++-----
 1 file changed, 42 insertions(+), 15 deletions(-)

diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py
index 650f607a0e743..15c4bfab911c9 100644
--- a/examples/applications/plot_cyclical_feature_engineering.py
+++ b/examples/applications/plot_cyclical_feature_engineering.py
@@ -61,14 +61,14 @@
 # .. note::
 #
 #     The fit method of the models used in this notebook all minimize the
-#     mean squared error to estimate the conditional mean instead of the mean
-#     absolute error that would fit an estimator of the conditional median.
-#
-#     When reporting performance measure on the test set in the discussion, we
-#     instead choose to focus on the mean absolute error that is more
-#     intuitive than the (root) mean squared error. Note, however, that the
-#     best models for one metric are also the best for the other in this
-#     study.
+#     mean squared error to estimate the conditional mean. Minimizing the mean
+#     absolute error, however, would estimate the conditional median.
+#
+#     Nevertheless, when reporting performance measures on the test set in
+#     the discussion, we choose to focus on the mean absolute error instead
+#     of the (root) mean squared error because it is more intuitive to
+#     interpret. Note, however, that in this study the best models for one
+#     metric are also the best ones in terms of the other metric.
 y = df["count"] / df["count"].max()
 
 # %%
@@ -170,7 +170,10 @@
 # efficiently handle heteorogenous tabular data with a mix of categorical and
 # numerical features as long as the number of samples is large enough.
 #
-# Here, we do minimal ordinal encoding for the categorical variables and then
+# Here, we use the modern
+# :class:`~sklearn.ensemble.HistGradientBoostingRegressor` with native support
+# for categorical features. Therefore, we only do minimal ordinal encoding for
+# the categorical variables and then
 # let the model know that it should treat those as categorical variables by
 # using a dedicated tree splitting rule. Since we use an ordinal encoder, we
 # pass the list of categorical values explicitly to use a logical order when
@@ -213,6 +216,9 @@
         verbose_feature_names_out=False,
     ),
     HistGradientBoostingRegressor(
+        max_iter=300,
+        early_stopping=True,
+        validation_fraction=0.1,
         categorical_features=categorical_columns,
         random_state=42,
     ),
@@ -222,16 +228,26 @@
 #
 # Lets evaluate our gradient boosting model with the mean absolute error of the
 # relative demand averaged across our 5 time-based cross-validation splits:
+import numpy as np
 
 
-def evaluate(model, X, y, cv):
+def evaluate(model, X, y, cv, model_prop=None, model_step=None):
     cv_results = cross_validate(
         model,
         X,
         y,
         cv=cv,
         scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
+        return_estimator=model_prop is not None,
     )
+    if model_prop is not None:
+        if model_step is not None:
+            values = [
+                getattr(m[model_step], model_prop) for m in cv_results["estimator"]
+            ]
+        else:
+            values = [getattr(m, model_prop) for m in cv_results["estimator"]]
+        print(f"Mean model.{model_prop} = {np.mean(values)}")
     mae = -cv_results["test_neg_mean_absolute_error"]
     rmse = -cv_results["test_neg_root_mean_squared_error"]
     print(
@@ -240,9 +256,18 @@ def evaluate(model, X, y, cv):
     )
 
 
-evaluate(gbrt_pipeline, X, y, cv=ts_cv)
+evaluate(
+    gbrt_pipeline,
+    X,
+    y,
+    cv=ts_cv,
+    model_prop="n_iter_",
+    model_step="histgradientboostingregressor",
+)
 
 # %%
+# We see that `max_iter` was set large enough for early stopping to take place.
+#
 # This model has an average error around 4 to 5% of the maximum demand. This is
 # quite good for a first trial without any hyper-parameter tuning! We just had
 # to make the categorical variables explicit. Note that the time related
@@ -258,10 +283,8 @@ def evaluate(model, X, y, cv):
 #
 # As usual for linear models, categorical variables need to be one-hot encoded.
 # For consistency, we scale the numerical features to the same 0-1 range using
-# class:`sklearn.preprocessing.MinMaxScaler`, although in this case it does not
+# :class:`~sklearn.preprocessing.MinMaxScaler`, although in this case it does not
 # impact the results much because they are already on comparable scales:
-import numpy as np
-
 from sklearn.linear_model import RidgeCV
 from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
 
@@ -278,10 +301,14 @@ def evaluate(model, X, y, cv):
 )
 
 
-evaluate(naive_linear_pipeline, X, y, cv=ts_cv)
+evaluate(
+    naive_linear_pipeline, X, y, cv=ts_cv, model_prop="alpha_", model_step="ridgecv"
+)
 
 
 # %%
+# It is reassuring to see that the selected `alpha_` is in our specified
+# range.
 #
 # The performance is not good: the average error is around 14% of the maximum
 # demand. This is more than three times higher than the average error of the