Commit 8052192
UPD: remove large files and transform nb to py file
Thibault Cordier committed Nov 14, 2023
1 parent bd013b9
Showing 4 changed files with 249 additions and 35,263 deletions.
249 changes: 249 additions & 0 deletions
examples/regression/3-scientific-articles/plot_zaffran2022_comparison.py
@@ -0,0 +1,249 @@
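"""
==================================================================
Reproducing the ACI results of Zaffran et al. (2022) with MAPIE
==================================================================

This example aims to reproduce part of the experiments of Zaffran et al.
(2022), "Adaptive Conformal Predictions for Time Series". It downloads the
French electricity spot price data used in the paper, fits a
MapieTimeSeriesRegressor with the adaptive conformal inference (ACI)
method on the hour-0 series, and compares the resulting prediction
intervals with the reference intervals published in the authors'
repository (mzaffran/AdaptiveConformalPredictionsTimeSeries).
"""
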
import warnings

from typing import Tuple
from urllib.request import urlopen
import ssl
import pickle

import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

from mapie.time_series_regression import MapieTimeSeriesRegressor

from mapie._typing import NDArray

warnings.simplefilter("ignore")


#########################################################
# Global random forests parameters
#########################################################

def init_model():
    # the number of trees in the forest
    n_estimators = 1000

    # the minimum number of samples required to be at a leaf node
    # (default skgarden's parameter)
    min_samples_leaf = 1

    # the number of features to consider when looking for the best split
    # (default skgarden's parameter)
    max_features = 6

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=1
    )

    return model


#########################################################
# Get data
#########################################################

def get_data() -> pd.DataFrame:
    """
    Download the extract of the French electricity spot prices (2016-2019)
    used by Zaffran et al. (2022) from the authors' repository.

    Returns
    -------
    pd.DataFrame
        The spot prices and associated features.
    """
    website = "https://raw.githubusercontent.com/"
    page = "mzaffran/AdaptiveConformalPredictionsTimeSeries/"
    folder = "131656fe4c25251bad745f52db3c2d7cb1c24bbb/data_prices/"
    file = "Prices_2016_2019_extract.csv"
    url = website + page + folder + file
    ssl._create_default_https_context = ssl._create_unverified_context
    df = pd.read_csv(url)
    return df


#########################################################
# Get & Present data
#########################################################

data = get_data()

date_data = pd.to_datetime(data.Date)

plt.figure(figsize=(10, 5))
plt.plot(date_data, data.Spot, color='black', linewidth=0.6)

locs, labels = plt.xticks()
new_labels = ['2016', '2017', '2018', '2019', '2020']
plt.xticks(locs[0:len(locs):2], labels=new_labels)

plt.xlabel('Date')
plt.ylabel('Spot price (\u20AC/MWh)')

plt.show()

#########################################################
# Prepare data
#########################################################

limit = datetime.datetime(2019, 1, 1, tzinfo=datetime.timezone.utc)
id_train = data.index[pd.to_datetime(data['Date'], utc=True) < limit].tolist()
id_test = data.index[pd.to_datetime(data['Date'], utc=True) >= limit].tolist()

data_train = data.iloc[id_train, :]
data_test = data.iloc[id_test, :]
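
# The feature names below (taken from the original experiment) suggest the
# hour of day, day-of-week indicator variables, spot prices lagged by 24
# hours and by one week (168 hours), and a consumption feature ('conso');
# this reading of the columns is inferred from their names.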
features = ['hour'] + ['dow_%d' % i for i in range(7)] \
    + ['lag_24_%d' % i for i in range(24)] \
    + ['lag_168_%d' % i for i in range(24)] + ['conso']
X_train = data_train.loc[:, features]
y_train = data_train.Spot

X_train_0 = X_train.loc[X_train.hour == 0]
y_train_0 = data_train.loc[data_train.hour == 0, 'Spot']

X_test = data_test.loc[:, features]
y_test = data_test.Spot

X_test_0 = X_test.loc[X_test.hour == 0]
y_test_0 = data_test.loc[data_test.hour == 0, 'Spot']


#########################################################
# Prepare model
#########################################################

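# The parameters below drive the online ACI experiment: `alpha` is the
# target miscoverage level of the prediction intervals, `gamma` is the ACI
# step size used to adapt the effective level online, `gap` is the number
# of new observations between two updates, and `iteration_max` caps the
# number of online steps compared with the reference results.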
alpha = 0.1
gap = 1
iteration_max = 10
gamma = 0.04

model = init_model()
mapie_aci = MapieTimeSeriesRegressor(
    model, method="aci", agg_function="mean", n_jobs=-1
)


#########################################################
# Reproduce experiment and results
#########################################################

all_x_train = [
    np.array(data_train.loc[data_train.hour == h]) for h in range(24)
]

train_size = all_x_train[0].shape[0]
idx = np.array(range(train_size))
n_half = int(np.floor(train_size / 2))

X_train_0 = X_train_0[:n_half]
y_train_0 = y_train_0[:n_half]

mapie_aci = mapie_aci.fit(X_train_0, y_train_0)
y_pred_aci_npfit, y_pis_aci_npfit = mapie_aci.predict(
    X_test_0, alpha=alpha, ensemble=True, optimize_beta=False
)
print("MAPIE estimator fitted!")

# step 0
y_pred_aci_pfit = np.zeros(y_pred_aci_npfit.shape)
y_pis_aci_pfit = np.zeros(y_pis_aci_npfit.shape)
y_pred_aci_pfit[:gap], y_pis_aci_pfit[:gap, :, :] = mapie_aci.predict(
    X_test_0.iloc[:gap, :], alpha=alpha, ensemble=True, optimize_beta=False
)

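# At each subsequent step, the loop below refits the underlying forest on
# the most recent observation(s), updates the conformity scores with
# `partial_fit`, adapts the effective level with `adapt_conformal_inference`
# (the ACI update, with step size `gamma`), and predicts the interval for
# the next time step.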
# step t
for step in range(1, min(len(X_test_0), iteration_max + 1), gap):

    mapie_aci.estimator_.single_estimator_.fit(
        X_test_0.iloc[(step - gap):step, :],
        y_test_0.iloc[(step - gap):step]
    )

    mapie_aci.partial_fit(
        X_test.iloc[(step - gap):step, :],
        y_test.iloc[(step - gap):step],
    )

    mapie_aci.adapt_conformal_inference(
        X_test_0.iloc[(step - gap):step, :],
        y_test_0.iloc[(step - gap):step],
        gamma=gamma
    )
    (
        y_pred_aci_pfit[step:step + gap],
        y_pis_aci_pfit[step:step + gap, :, :],
    ) = mapie_aci.predict(
        X_test_0.iloc[step:(step + gap), :],
        alpha=alpha,
        ensemble=True,
        optimize_beta=True
    )

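# `y_pis_aci_pfit` has shape (n_samples, 2, n_alpha): lower and upper
# interval bounds for each requested alpha (here a single level).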
results = y_pis_aci_pfit.copy()


#########################################################
# Get referenced result to reproduce
#########################################################

def get_pickle() -> Tuple[NDArray, NDArray]:
    """
    Download the reference results of the original experiment (ACI with
    gamma = 0.04 and a random forest) from the authors' repository.

    Returns
    -------
    The unpickled reference results, or None if the download fails.
    """
    website = "https://github.com/"
    page = "mzaffran/AdaptiveConformalPredictionsTimeSeries/raw/"
    folder = "131656fe4c25251bad745f52db3c2d7cb1c24bbb/results/"
    folder += "Spot_France_Hour_0_train_2019-01-01/"
    file = "ACP_0.04_RF.pkl"
    url = website + page + folder + file
    ssl._create_default_https_context = ssl._create_unverified_context
    loaded_data = None
    try:
        loaded_data = pickle.load(urlopen(url))
    except FileNotFoundError:
        print(f"The file {file} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return loaded_data


data_ref = get_pickle()
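# `data_ref` is expected to expose the arrays 'Y_inf' and 'Y_sup' with the
# lower and upper bounds of the reference intervals (inferred from how it
# is indexed below).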


#########################################################
# Compare results
#########################################################

# Flatten the arrays to shape (n, 2)
results_ref = np.concatenate(
    [data_ref['Y_inf'], data_ref['Y_sup']], axis=-1
).reshape(-1, 2)
results = np.array(results.reshape(-1, 2))

# Compare the MAPIE intervals with the reference ones
comparison_result_Y_inf = np.array_equal(
    results[:iteration_max, 0],
    results_ref[:iteration_max, 0]
)
comparison_result_Y_sup = np.array_equal(
    results[:iteration_max, 1],
    results_ref[:iteration_max, 1]
)

# Print the comparison results
print(f"Comparison (Y_inf): {results[:iteration_max, 0]}")
print(f"Comparison (Y_inf ref): {results_ref[:iteration_max, 0]}")
print(f"Comparison (Y_sup): {results[:iteration_max, 1]}")
print(f"Comparison (Y_sup ref): {results_ref[:iteration_max, 1]}")
print(f"Comparison for ACP_0.04 (Y_inf): {comparison_result_Y_inf}")
print(f"Comparison for ACP_0.04 (Y_sup): {comparison_result_Y_sup}")