From a52c06dab1e996dce46339b8b3f61e41c5b9c47a Mon Sep 17 00:00:00 2001 From: Niels <94110348+nnansters@users.noreply.github.com> Date: Mon, 29 Apr 2024 09:48:42 +0100 Subject: [PATCH] Make prediction column optional for performance estimation / calculation. (#380) * Make y_pred column optional for estimated and realized performance + tests * Update data requirements docs * Fix _list_missing issues + add "run with prediction" tests * Fix flake8 issues --- docs/tutorials/data_requirements.rst | 5 +- nannyml/performance_calculation/calculator.py | 28 +++++++- nannyml/performance_calculation/result.py | 2 +- .../confidence_based/cbpe.py | 34 +++++++-- .../confidence_based/results.py | 2 +- nannyml/sampling_error/summary_stats.py | 2 +- .../test_performance_calculator.py | 68 ++++++++++++++++++ .../performance_estimation/CBPE/test_cbpe.py | 69 +++++++++++++++++++ 8 files changed, 197 insertions(+), 13 deletions(-) diff --git a/docs/tutorials/data_requirements.rst b/docs/tutorials/data_requirements.rst index 13e1155af..6467dc03c 100644 --- a/docs/tutorials/data_requirements.rst +++ b/docs/tutorials/data_requirements.rst @@ -170,7 +170,7 @@ The :term:`predicted label`, retrieved by interpreting (thresh In the sample data this is the **y_pred** column. Required for running :ref:`performance estimation` or :ref:`performance calculation` on binary classification, multiclass, and regression models. - +On binary classification models, it is not required for calculating the **AUROC** and **average precision** metrics. NannyML Functionality Requirements ---------------------------------- @@ -190,7 +190,8 @@ You can see those requirements in the table below: | y_pred_proba | Required (reference and analysis) | | | | | | Required (reference and analysis) | +--------------+-------------------------------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+ | y_pred | | Required (reference and analysis) | Required (reference and analysis) | | Required (reference and analysis) | | | | Required (reference and analysis) | -| | | Not needed for ROC_AUC metric | | | Not needed for ROC_AUC metric | | | | | +| | | Not needed for ROC_AUC or | | | Not needed for ROC_AUC or | | | | | +| | | average precision metrics | | | average precision metrics | | | | | +--------------+-------------------------------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+ | y_true | Required (reference only) | Required (reference only) | Required (reference and analysis) | | | Required (reference and analysis) | | +--------------+-------------------------------------+-------------------------------------+-------------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+-----------------------------------+ diff --git a/nannyml/performance_calculation/calculator.py b/nannyml/performance_calculation/calculator.py index 55beb2e01..d780f00ef 100644 --- a/nannyml/performance_calculation/calculator.py +++ b/nannyml/performance_calculation/calculator.py @@ -84,8 +84,8 @@ def __init__( self, metrics: Union[str, List[str]], y_true: str, - y_pred: str, problem_type: Union[str, ProblemType], + y_pred: Optional[str] = None, 
y_pred_proba: Optional[ModelOutputsType] = None, timestamp_column_name: Optional[str] = None, thresholds: Optional[Dict[str, Threshold]] = None, @@ -105,8 +105,10 @@ def __init__( A metric or list of metrics to calculate. y_true: str The name of the column containing target values. - y_pred: str + y_pred: Optional[str], default=None The name of the column containing your model predictions. + This parameter is optional for binary classification cases. + When it is not given, only the ROC AUC and Average Precision metrics are supported. problem_type: Union[str, ProblemType] Determines which method to use. Allowed values are: @@ -211,7 +213,12 @@ def __init__( self.problem_type = problem_type if self.problem_type is not ProblemType.REGRESSION and y_pred_proba is None: - raise InvalidArgumentsException(f"'y_pred_proba' can not be 'None' for problem type {ProblemType.value}") + raise InvalidArgumentsException( + f"'y_pred_proba' can not be 'None' for problem type {self.problem_type.value}" + ) + + if self.problem_type is not ProblemType.CLASSIFICATION_BINARY and y_pred is None: + raise InvalidArgumentsException(f"'y_pred' can not be 'None' for problem type {self.problem_type.value}") self.thresholds = DEFAULT_THRESHOLDS if thresholds: @@ -236,6 +243,8 @@ def __init__( if metric not in SUPPORTED_METRIC_VALUES: raise InvalidArgumentsException(f"Metric '{metric}' is not supported.") + raise_if_metrics_require_y_pred(metrics, y_pred) + self.metrics: List[Metric] = [ MetricFactory.create( m, @@ -387,3 +396,16 @@ def _create_multilevel_index(metric_names: List[str]): tuples = chunk_tuples + reconstruction_tuples return MultiIndex.from_tuples(tuples) + + +def raise_if_metrics_require_y_pred(metrics: List[str], y_pred: Optional[str]): + """Raise an exception if metrics require y_pred and y_pred is not set. + + Current metrics that require 'y_pred' are: + - roc_auc + - average_precision + """ + metrics_that_need_y_pred = [m for m in metrics if m not in ['roc_auc', 'average_precision']] + + if len(metrics_that_need_y_pred) > 0 and y_pred is None: + raise InvalidArgumentsException(f"Metrics '{metrics_that_need_y_pred}' require 'y_pred' to be set.") diff --git a/nannyml/performance_calculation/result.py b/nannyml/performance_calculation/result.py index 9475dfe01..76b7f20b8 100644 --- a/nannyml/performance_calculation/result.py +++ b/nannyml/performance_calculation/result.py @@ -30,7 +30,7 @@ def __init__( self, results_data: pd.DataFrame, problem_type: ProblemType, - y_pred: str, + y_pred: Optional[str], y_pred_proba: Optional[Union[str, Dict[str, str]]], y_true: str, metrics: List[Metric], diff --git a/nannyml/performance_estimation/confidence_based/cbpe.py b/nannyml/performance_estimation/confidence_based/cbpe.py index 52d4aaa39..aabf2dc50 100644 --- a/nannyml/performance_estimation/confidence_based/cbpe.py +++ b/nannyml/performance_estimation/confidence_based/cbpe.py @@ -75,10 +75,10 @@ class CBPE(AbstractEstimator): def __init__( self, metrics: Union[str, List[str]], - y_pred: str, y_pred_proba: ModelOutputsType, y_true: str, problem_type: Union[str, ProblemType], + y_pred: Optional[str] = None, timestamp_column_name: Optional[str] = None, chunk_size: Optional[int] = None, chunk_number: Optional[int] = None, @@ -103,8 +103,6 @@ def __init__( - For binary classification, pass a single string refering to the model output column. - For multiclass classification, pass a dictionary that maps a class string to the column name model outputs for that class. 
- y_pred: str - The name of the column containing your model predictions. timestamp_column_name: str, default=None The name of the column containing the timestamp of the model prediction. If not given, plots will not use a time-based x-axis but will use the index of the chunks instead. @@ -121,6 +119,8 @@ def __init__( - `accuracy` - `confusion_matrix` - only for binary classification tasks - `business_value` - only for binary classification tasks + y_pred: str + The name of the column containing your model predictions. chunk_size: int, default=None Splits the data into chunks containing `chunks_size` observations. Only one of `chunk_size`, `chunk_number` or `chunk_period` should be given. @@ -256,6 +256,9 @@ def __init__( else: self.problem_type = problem_type + if self.problem_type is not ProblemType.CLASSIFICATION_BINARY and y_pred is None: + raise InvalidArgumentsException(f"'y_pred' can not be 'None' for problem type {self.problem_type.value}") + self.thresholds = DEFAULT_THRESHOLDS if thresholds: self.thresholds.update(**thresholds) @@ -263,6 +266,8 @@ def __init__( if isinstance(metrics, str): metrics = [metrics] + raise_if_metrics_require_y_pred(metrics, y_pred) + self.metrics = [] for metric in metrics: if metric not in SUPPORTED_METRIC_VALUES: @@ -341,7 +346,10 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result: raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.') if self.problem_type == ProblemType.CLASSIFICATION_BINARY: - _list_missing([self.y_pred, self.y_pred_proba], data) + required_cols = [self.y_pred_proba] + if self.y_pred is not None: + required_cols.append(self.y_pred) + _list_missing(required_cols, list(data.columns)) # We need uncalibrated data to calculate the realized performance on. # https://github.com/NannyML/nannyml/issues/98 @@ -414,7 +422,10 @@ def _fit_binary(self, reference_data: pd.DataFrame) -> CBPE: if reference_data.empty: raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.') - _list_missing([self.y_true, self.y_pred_proba, self.y_pred], list(reference_data.columns)) + required_cols = [self.y_true, self.y_pred_proba] + if self.y_pred is not None: + required_cols.append(self.y_pred) + _list_missing(required_cols, list(reference_data.columns)) # We need uncalibrated data to calculate the realized performance on. # We need realized performance in threshold calculations. @@ -552,3 +563,16 @@ def _calibrate_predicted_probabilities( calibrated_data[predicted_class_proba_column_names[idx]] = calibrated_probas[:, idx] return calibrated_data + + +def raise_if_metrics_require_y_pred(metrics: List[str], y_pred: Optional[str]): + """Raise an exception if metrics require y_pred and y_pred is not set. 
+ + Current metrics that require 'y_pred' are: + - roc_auc + - average_precision + """ + metrics_that_need_y_pred = [m for m in metrics if m not in ['roc_auc', 'average_precision']] + + if len(metrics_that_need_y_pred) > 0 and y_pred is None: + raise InvalidArgumentsException(f"Metrics '{metrics_that_need_y_pred}' require 'y_pred' to be set.") diff --git a/nannyml/performance_estimation/confidence_based/results.py b/nannyml/performance_estimation/confidence_based/results.py index 308b897d0..958af9bed 100644 --- a/nannyml/performance_estimation/confidence_based/results.py +++ b/nannyml/performance_estimation/confidence_based/results.py @@ -30,7 +30,7 @@ def __init__( self, results_data: pd.DataFrame, metrics: List[Metric], - y_pred: str, + y_pred: Optional[str], y_pred_proba: ModelOutputsType, y_true: str, chunker: Chunker, diff --git a/nannyml/sampling_error/summary_stats.py b/nannyml/sampling_error/summary_stats.py index 36ee53aaa..550341f76 100644 --- a/nannyml/sampling_error/summary_stats.py +++ b/nannyml/sampling_error/summary_stats.py @@ -2,12 +2,12 @@ # # License: Apache Software License 2.0 +import warnings from logging import getLogger from typing import Tuple import numpy as np import pandas as pd -import warnings from scipy.stats import gaussian_kde, moment logger = getLogger(__name__) diff --git a/tests/performance_calculation/test_performance_calculator.py b/tests/performance_calculation/test_performance_calculator.py index ff8ceadd1..be5effa6a 100644 --- a/tests/performance_calculation/test_performance_calculator.py +++ b/tests/performance_calculation/test_performance_calculator.py @@ -106,6 +106,58 @@ def test_performance_calculator_create_with_single_or_list_of_metrics(metrics, e assert [metric.column_name for metric in calc.metrics] == expected +@pytest.mark.parametrize( + 'problem', + [ + "classification_multiclass", + "regression", + ], +) +def test_performance_calculator_create_raises_exception_when_y_pred_not_given_and_problem_type_not_binary_clf( + problem, +): + with pytest.raises(InvalidArgumentsException, match=f"'y_pred' can not be 'None' for problem type {problem}"): + _ = PerformanceCalculator( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=['roc_auc', 'f1'], + problem_type=problem, + ) + + +@pytest.mark.parametrize( + 'metric, expected', + [ + (['roc_auc', 'f1'], "['f1']"), + (['roc_auc', 'f1', 'average_precision', 'precision'], "['f1', 'precision']"), + ], +) +def test_performance_calculator_create_without_y_pred_raises_exception_when_metrics_require_it(metric, expected): + with pytest.raises(InvalidArgumentsException, match=expected): + _ = PerformanceCalculator( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=metric, + problem_type='classification_binary', + ) + + +@pytest.mark.parametrize('metric', ['roc_auc', 'average_precision']) +def test_performance_calculator_create_without_y_pred_works_when_metrics_dont_require_it(metric): + try: + _ = PerformanceCalculator( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=metric, + problem_type='classification_binary', + ) + except Exception as exc: + pytest.fail(f'unexpected exception: {exc}') + + def test_calculator_fit_should_raise_invalid_args_exception_when_no_target_data_present(data): # noqa: D103, F821 calc = PerformanceCalculator( timestamp_column_name='timestamp', @@ -410,3 +462,19 @@ def test_binary_classification_result_plots_raise_no_exceptions(calc_args, plot_ 
_ = sut.plot(**plot_args) except Exception as exc: pytest.fail(f"an unexpected exception occurred: {exc}") + + +def test_binary_classification_calculate_without_prediction_column(): + reference, analysis, analysis_targets = load_synthetic_binary_classification_dataset() + try: + calc = PerformanceCalculator( + y_true='work_home_actual', + y_pred_proba='y_pred_proba', + problem_type=ProblemType.CLASSIFICATION_BINARY, + metrics=['roc_auc', 'average_precision'], + timestamp_column_name='timestamp', + chunk_period='M', + ).fit(reference) + _ = calc.calculate(analysis.merge(analysis_targets, on='id')) + except Exception as exc: + pytest.fail(f"an unexpected exception occurred: {exc}") diff --git a/tests/performance_estimation/CBPE/test_cbpe.py b/tests/performance_estimation/CBPE/test_cbpe.py index 270dc6012..3436723ac 100644 --- a/tests/performance_estimation/CBPE/test_cbpe.py +++ b/tests/performance_estimation/CBPE/test_cbpe.py @@ -64,6 +64,56 @@ def test_cbpe_create_with_single_or_list_of_metrics(metrics, expected): assert [metric.name for metric in sut.metrics] == expected +@pytest.mark.parametrize( + 'problem', + [ + "classification_multiclass", + "regression", + ], +) +def test_cbpe_create_raises_exception_when_y_pred_not_given_and_problem_type_not_binary_classification(problem): + with pytest.raises(InvalidArgumentsException, match=f"'y_pred' can not be 'None' for problem type {problem}"): + _ = CBPE( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=['roc_auc', 'f1'], + problem_type=problem, + ) + + +@pytest.mark.parametrize( + 'metric, expected', + [ + (['roc_auc', 'f1'], "['f1']"), + (['roc_auc', 'f1', 'average_precision', 'precision'], "['f1', 'precision']"), + ], +) +def test_cbpe_create_without_y_pred_raises_exception_when_metrics_require_it(metric, expected): + with pytest.raises(InvalidArgumentsException, match=expected): + _ = CBPE( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=metric, + problem_type='classification_binary', + ) + + +@pytest.mark.parametrize('metric', ['roc_auc', 'average_precision']) +def test_cbpe_create_without_y_pred_works_when_metrics_dont_require_it(metric): + try: + _ = CBPE( + timestamp_column_name='timestamp', + y_pred_proba='y_pred_proba', + y_true='y_true', + metrics=metric, + problem_type='classification_binary', + ) + except Exception as exc: + pytest.fail(f'unexpected exception: {exc}') + + def test_cbpe_will_calibrate_scores_when_needed(binary_classification_data): # noqa: D103 ref_df = binary_classification_data[0] @@ -652,3 +702,22 @@ def test_cbpe_with_default_thresholds(): sut = est.thresholds assert sut == DEFAULT_THRESHOLDS + + +def test_cbpe_without_predictions(): + ref_df, ana_df, _ = load_synthetic_binary_classification_dataset() + try: + cbpe = CBPE( + y_pred_proba='y_pred_proba', + y_true='work_home_actual', + problem_type='classification_binary', + metrics=[ + 'roc_auc', + 'average_precision', + ], + timestamp_column_name='timestamp', + chunk_period='M', + ).fit(ref_df) + _ = cbpe.estimate(ana_df) + except Exception as exc: + pytest.fail(f'unexpected exception: {exc}')
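
Usage sketch (illustrative, not part of the patch): the snippet below mirrors the test_cbpe_without_predictions test added above and shows the behaviour this change enables, namely running CBPE on a binary classification model without a y_pred column as long as only the ROC AUC and average precision metrics are requested. The dataset loader and column names ('y_pred_proba', 'work_home_actual', 'timestamp') come from NannyML's bundled synthetic binary classification dataset used in these tests; requesting any other metric without y_pred raises an InvalidArgumentsException naming the metrics that still require it.

import nannyml as nml
from nannyml.datasets import load_synthetic_binary_classification_dataset

# Reference and analysis data from the bundled synthetic dataset; the target
# column in this dataset is 'work_home_actual'.
reference_df, analysis_df, _ = load_synthetic_binary_classification_dataset()

# No 'y_pred' given: allowed for binary classification when the requested
# metrics are limited to 'roc_auc' and 'average_precision'.
estimator = nml.CBPE(
    y_pred_proba='y_pred_proba',
    y_true='work_home_actual',
    problem_type='classification_binary',
    metrics=['roc_auc', 'average_precision'],
    timestamp_column_name='timestamp',
    chunk_period='M',
).fit(reference_df)

results = estimator.estimate(analysis_df)
print(results.to_df())

# Adding a metric such as 'f1' to the list above would instead raise
# InvalidArgumentsException: "Metrics '['f1']' require 'y_pred' to be set."

PerformanceCalculator accepts the same pattern once targets are available for the analysis data, as exercised in test_binary_classification_calculate_without_prediction_column above.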