From 55ff6a32211c258aa13786f6e45b46fe25246822 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:00:53 +0100 Subject: [PATCH 1/7] Add column & method for univariate fitting errors --- nannyml/drift/univariate/calculator.py | 63 +++++++++++++++----------- 1 file changed, 37 insertions(+), 26 deletions(-) diff --git a/nannyml/drift/univariate/calculator.py b/nannyml/drift/univariate/calculator.py index 6f2052989..1ce569b3c 100644 --- a/nannyml/drift/univariate/calculator.py +++ b/nannyml/drift/univariate/calculator.py @@ -39,7 +39,7 @@ from nannyml.chunk import Chunker from nannyml.drift.univariate.methods import FeatureType, Method, MethodFactory from nannyml.drift.univariate.result import Result -from nannyml.exceptions import InvalidArgumentsException +from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.thresholds import ConstantThreshold, StandardDeviationThreshold, Threshold from nannyml.usage_logging import UsageEvent, log_usage @@ -271,34 +271,45 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> UnivariateDrift if column_name not in self.categorical_column_names: self.categorical_column_names.append(column_name) + timestamps = reference_data[self.timestamp_column_name] if self.timestamp_column_name else None for column_name in self.continuous_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CONTINUOUS, - chunker=self.chunker, - computation_params=self.computation_params or {}, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None, - ) - for method in self.continuous_method_names - ] + methods = [] + for method in self.continuous_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CONTINUOUS, + chunker=self.chunker, + computation_params=self.computation_params or {}, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods for column_name in self.categorical_column_names: - self._column_to_models_mapping[column_name] += [ - MethodFactory.create( - key=method, - feature_type=FeatureType.CATEGORICAL, - chunker=self.chunker, - threshold=self.thresholds[method], - ).fit( - reference_data=reference_data[column_name], - timestamps=reference_data[self.timestamp_column_name] if self.timestamp_column_name else None, - ) - for method in self.categorical_method_names - ] + methods = [] + for method in self.categorical_method_names: + try: + methods.append( + MethodFactory.create( + key=method, + feature_type=FeatureType.CATEGORICAL, + chunker=self.chunker, + threshold=self.thresholds[method], + ).fit( + reference_data=reference_data[column_name], + timestamps=timestamps, + ) + ) + except Exception as ex: + raise CalculatorException(f"Failed to fit method {method} for column {column_name}: {ex!r}") from ex + self._column_to_models_mapping[column_name] = methods self.result = self._calculate(reference_data) self.result.data['chunk', 'chunk', 'period'] = 'reference' From 0cfcf9264d409ef82bfb54b63898aea22da0e89f Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:18:00 +0100 
Subject: [PATCH 2/7] Refactor to use single data cleaning method --- nannyml/base.py | 17 +++++++----- nannyml/drift/univariate/methods.py | 24 ++++++++--------- .../performance_calculation/metrics/base.py | 22 ---------------- .../metrics/binary_classification.py | 26 +++++++++---------- .../metrics/multiclass_classification.py | 16 ++++++------ .../metrics/regression.py | 16 ++++++------ 6 files changed, 52 insertions(+), 69 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 6ef629fb4..6e313daa6 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -533,12 +533,17 @@ def _column_is_categorical(column: pd.Series) -> bool: return column.dtype in ['object', 'string', 'category', 'bool'] -def _remove_missing_data(column: pd.Series): - if isinstance(column, pd.Series): - column = column.dropna().reset_index(drop=True) - else: - column = column[~np.isnan(column)] - return column +def _clean_data(*data: Union[pd.Series, pd.DataFrame]) -> Tuple[pd.DataFrame, ...]: + """Remove rows with NaN values from the given data.""" + mask = np.ones(len(data[0]), dtype=bool) + for df in data: + if isinstance(df, pd.DataFrame): + mask &= ~df.isna().all(axis=1) + else: + mask &= ~df.isna() + + # NaN values have been dropped. Try to infer types again + return tuple(df[mask].reset_index(drop=True).infer_objects() for df in data) def _column_is_continuous(column: pd.Series) -> bool: diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index e165964b6..73e61a1a1 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from nannyml._typing import Self -from nannyml.base import _column_is_categorical, _remove_missing_data +from nannyml.base import _clean_data, _column_is_categorical from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -305,7 +305,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None def _calculate(self, data: pd.Series): reference_proba_in_bins = copy(self._reference_proba_in_bins) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if self._treat_as_type == 'cont': @@ -374,7 +374,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -389,7 +389,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if not self._fitted: @@ -443,13 +443,13 @@ def __init__(self, **kwargs) -> None: self._fitted = False def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0] self._fitted = True return self def _calculate(self, data: pd.Series): - data = 
_remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if not self._fitted: @@ -505,7 +505,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba: Optional[dict] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) ref_labels = reference_data.unique() self._reference_proba = {label: (reference_data == label).sum() / len(reference_data) for label in ref_labels} @@ -516,7 +516,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan data_labels = data.unique() @@ -574,7 +574,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -592,7 +592,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan if ( @@ -668,7 +668,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data = _remove_missing_data(reference_data) + reference_data, = _clean_data(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -695,7 +695,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data = _remove_missing_data(data) + data, = _clean_data(data) if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) diff --git a/nannyml/performance_calculation/metrics/base.py b/nannyml/performance_calculation/metrics/base.py index 49da27ff0..18e2ee412 100644 --- a/nannyml/performance_calculation/metrics/base.py +++ b/nannyml/performance_calculation/metrics/base.py @@ -255,25 +255,3 @@ def inner_wrapper(wrapped_class: Type[Metric]) -> Type[Metric]: return wrapped_class return inner_wrapper - - -def _common_data_cleaning(y_true: pd.Series, y_pred: Union[pd.Series, pd.DataFrame]): - y_true, y_pred = ( - y_true.reset_index(drop=True), - y_pred.reset_index(drop=True), - ) - - if isinstance(y_pred, pd.DataFrame): - y_true = y_true[~y_pred.isna().all(axis=1)] - else: - y_true = y_true[~y_pred.isna()] - y_pred.dropna(inplace=True) - - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - - # NaN values have been dropped. 
Try to infer types again - y_pred = y_pred.infer_objects() - y_true = y_true.infer_objects() - - return y_true, y_pred diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 7ea66ec8e..4c210c56d 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -9,10 +9,10 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score from nannyml._typing import ProblemType -from nannyml.base import _list_missing +from nannyml.base import _clean_data, _list_missing from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.binary_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -97,7 +97,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred_proba] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") @@ -166,7 +166,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") @@ -234,7 +234,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") @@ -302,7 +302,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") @@ -375,7 +375,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") @@ -449,7 +449,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") @@ -556,7 +556,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true is None: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -752,7 +752,7 @@ def 
_calculate_true_positives(self, data: pd.DataFrame) -> float: "could not calculate metric true_positive. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -781,7 +781,7 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric true_negative. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -810,7 +810,7 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: "could not calculate metric false_positive. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -839,7 +839,7 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric false_negative. prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 33ba37849..928eb2b46 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -24,10 +24,10 @@ from sklearn.preprocessing import LabelBinarizer, label_binarize from nannyml._typing import ProblemType, class_labels, model_output_column_names -from nannyml.base import _list_missing +from nannyml.base import _clean_data, _list_missing from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.multiclass_classification import ( accuracy_sampling_error, accuracy_sampling_error_components, @@ -130,7 +130,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan @@ -218,7 +218,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -306,7 +306,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 
1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -394,7 +394,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -482,7 +482,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -566,7 +566,7 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/regression.py b/nannyml/performance_calculation/metrics/regression.py index 0c75f83f2..8b0c65399 100644 --- a/nannyml/performance_calculation/metrics/regression.py +++ b/nannyml/performance_calculation/metrics/regression.py @@ -13,8 +13,8 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _list_missing, _raise_exception_for_negative_values -from nannyml.performance_calculation.metrics.base import Metric, MetricFactory, _common_data_cleaning +from nannyml.base import _clean_data, _list_missing, _raise_exception_for_negative_values +from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.regression import ( mae_sampling_error, mae_sampling_error_components, @@ -80,7 +80,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -138,7 +138,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -196,7 +196,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -254,7 +254,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -317,7 +317,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -375,7 +375,7 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _common_data_cleaning(y_true, y_pred) + 
y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan From 369efbfe9ec05d915e01a3d330e9764d0571eb0a Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 19:24:34 +0100 Subject: [PATCH 3/7] Filter NaN's when fitting JS --- nannyml/drift/univariate/methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 73e61a1a1..899adec85 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -278,6 +278,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): + reference_data, = _clean_data(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: From a0e5b4e32740db782c6034237ca8fb0d5cd3ea2d Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 23:09:48 +0100 Subject: [PATCH 4/7] Refactor data cleaning to accept columns argument Previously the data cleaning method operated by accepting multiple dataframes and inspecting each dataframe separately for `NaN`'s. Depending on how the data is processed after cleaning, splitting columns into separate dataframes can be rather annoying. To avoid that, this commit changes the method to accept a single dataframe and a columns argument. The columns argument specifies which column subsets should be inspected for `NaN`'s, enabling the same behaviour using a more convenient syntax. --- nannyml/base.py | 41 ++++++++++++++----- nannyml/drift/univariate/methods.py | 26 ++++++------ .../metrics/binary_classification.py | 31 ++++++-------- .../metrics/multiclass_classification.py | 20 ++++----- .../metrics/regression.py | 14 +++---- 5 files changed, 73 insertions(+), 59 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 6e313daa6..a7958ce3f 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -8,7 +8,7 @@ import copy import logging from abc import ABC, abstractmethod -from typing import Generic, List, Optional, Tuple, TypeVar, Union +from typing import Generic, Iterable, List, Optional, Tuple, TypeVar, Union, overload import numpy as np import pandas as pd @@ -533,17 +533,38 @@ def _column_is_categorical(column: pd.Series) -> bool: return column.dtype in ['object', 'string', 'category', 'bool'] -def _clean_data(*data: Union[pd.Series, pd.DataFrame]) -> Tuple[pd.DataFrame, ...]: - """Remove rows with NaN values from the given data.""" - mask = np.ones(len(data[0]), dtype=bool) - for df in data: - if isinstance(df, pd.DataFrame): - mask &= ~df.isna().all(axis=1) - else: - mask &= ~df.isna() +@overload +def _remove_nans(data: pd.Series, columns: None) -> pd.Series: + ... + +@overload +def _remove_nans(data: pd.DataFrame, columns: Optional[Iterable[Union[str, Iterable[str]]]]) -> pd.DataFrame: + ... + + +def _remove_nans( + data: Union[pd.Series, pd.DataFrame], columns: Optional[Iterable[Union[str, Iterable[str]]]] = None +) -> Union[pd.Series, pd.DataFrame]: + """Remove rows with NaN values in the specified columns. + + If no columns are given, drop rows with NaN values in any column. If columns are given, drop rows with NaN values + in the specified columns. If a set of columns is given, drop rows with NaN values in all of the columns in the set.
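+
+    For example (illustrative column names), the call below drops rows where 'y_true' is NaN, as well as
+    rows in which both 'prob_a' and 'prob_b' are NaN:
+
+        _remove_nans(data, columns=['y_true', ['prob_a', 'prob_b']])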
+ """ + # If no columns are given, drop rows with NaN values in any columns + if columns is None: + mask = ~data.isna() + if isinstance(mask, pd.DataFrame): + mask = mask.all(axis=1) + else: + mask = np.ones(len(data), dtype=bool) + for column_selector in columns: + nans = data[column_selector].isna() + if isinstance(nans, pd.DataFrame): + nans = nans.all(axis=1) + mask &= ~nans # NaN values have been dropped. Try to infer types again - return tuple(df[mask].reset_index(drop=True).infer_objects() for df in data) + return data[mask].reset_index(drop=True).infer_objects() def _column_is_continuous(column: pd.Series) -> bool: diff --git a/nannyml/drift/univariate/methods.py b/nannyml/drift/univariate/methods.py index 899adec85..84a7d3c12 100644 --- a/nannyml/drift/univariate/methods.py +++ b/nannyml/drift/univariate/methods.py @@ -29,7 +29,7 @@ from scipy.stats import chi2_contingency, ks_2samp, wasserstein_distance from nannyml._typing import Self -from nannyml.base import _clean_data, _column_is_categorical +from nannyml.base import _remove_nans, _column_is_categorical from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException, NotFittedException from nannyml.thresholds import Threshold, calculate_threshold_values @@ -278,7 +278,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None): - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -306,7 +306,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None def _calculate(self, data: pd.Series): reference_proba_in_bins = copy(self._reference_proba_in_bins) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if self._treat_as_type == 'cont': @@ -375,7 +375,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -390,7 +390,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -444,13 +444,13 @@ def __init__(self, **kwargs) -> None: self._fitted = False def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) self._reference_data_vcs = reference_data.value_counts().loc[lambda v: v != 0] self._fitted = True return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if not self._fitted: @@ -506,7 +506,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba: Optional[dict] = None def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) ref_labels = reference_data.unique() self._reference_proba = {label: 
(reference_data == label).sum() / len(reference_data) for label in ref_labels} @@ -517,7 +517,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan data_labels = data.unique() @@ -575,7 +575,7 @@ def __init__(self, **kwargs) -> None: self.n_bins = kwargs['computation_params'].get('n_bins', 10_000) def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if (self.calculation_method == 'auto' and len(reference_data) < 10_000) or self.calculation_method == 'exact': self._reference_data = reference_data else: @@ -593,7 +593,7 @@ def _calculate(self, data: pd.Series): raise NotFittedException( "tried to call 'calculate' on an unfitted method " f"{self.display_name}. Please run 'fit' first" ) - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan if ( @@ -669,7 +669,7 @@ def __init__(self, **kwargs) -> None: self._reference_proba_in_bins: np.ndarray def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None) -> Self: - reference_data, = _clean_data(reference_data) + reference_data = _remove_nans(reference_data) if _column_is_categorical(reference_data): treat_as_type = 'cat' else: @@ -696,7 +696,7 @@ def _fit(self, reference_data: pd.Series, timestamps: Optional[pd.Series] = None return self def _calculate(self, data: pd.Series): - data, = _clean_data(data) + data = _remove_nans(data) if data.empty: return np.nan reference_proba_in_bins = copy(self._reference_proba_in_bins) diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index 4c210c56d..e41ec382b 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -9,7 +9,7 @@ from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score from nannyml._typing import ProblemType -from nannyml.base import _clean_data, _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.performance_calculation.metrics.base import Metric, MetricFactory @@ -93,12 +93,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred_proba], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred_proba)) y_true = data[self.y_true] y_pred = data[self.y_pred_proba] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan @@ -162,12 +161,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -230,12 +228,11 @@
def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -298,12 +295,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -366,6 +362,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -375,8 +372,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -440,6 +435,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -449,8 +445,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan @@ -547,6 +541,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -556,7 +551,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true is None: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -743,6 +737,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate_true_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -752,7 +747,6 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: "could not calculate metric true_positive. 
prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -772,6 +766,7 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: def _calculate_true_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -781,7 +776,6 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric true_negative. prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -801,6 +795,7 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: def _calculate_false_positives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -809,8 +804,6 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: raise InvalidArgumentsException( "could not calculate metric false_positive. prediction column contains no data" ) - - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -830,6 +823,7 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: def _calculate_false_negatives(self, data: pd.DataFrame) -> float: _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -839,7 +833,6 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: "could not calculate metric false_negative. 
prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/multiclass_classification.py b/nannyml/performance_calculation/metrics/multiclass_classification.py index 928eb2b46..3fb34fe55 100644 --- a/nannyml/performance_calculation/metrics/multiclass_classification.py +++ b/nannyml/performance_calculation/metrics/multiclass_classification.py @@ -24,7 +24,7 @@ from sklearn.preprocessing import LabelBinarizer, label_binarize from nannyml._typing import ProblemType, class_labels, model_output_column_names -from nannyml.base import _clean_data, _list_missing +from nannyml.base import _remove_nans, _list_missing from nannyml.chunk import Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.performance_calculation.metrics.base import Metric, MetricFactory @@ -116,6 +116,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true] + model_output_column_names(self.y_pred_proba), data) + data = _remove_nans(data, (self.y_true, self.y_pred_proba.values())) labels, class_probability_columns = [], [] for label in sorted(list(self.y_pred_proba.keys())): @@ -123,19 +124,18 @@ def _calculate(self, data: pd.DataFrame): class_probability_columns.append(self.y_pred_proba[label]) y_true = data[self.y_true] - y_pred = data[class_probability_columns] + y_pred_proba = data[class_probability_columns] - if y_pred.isna().all().any(): + if y_pred_proba.isna().all().any(): raise InvalidArgumentsException( f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.nunique() <= 1: warnings.warn("Calculated ROC-AUC score contains NaN values.") return np.nan else: - return roc_auc_score(y_true, y_pred, multi_class='ovr', average='macro', labels=labels) + return roc_auc_score(y_true, y_pred_proba, multi_class='ovr', average='macro', labels=labels) def _sampling_error(self, data: pd.DataFrame) -> float: return auroc_sampling_error(self._sampling_error_components, data) @@ -208,6 +208,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -218,7 +219,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated F1-score contains NaN values.") return np.nan @@ -296,6 +296,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -306,7 +307,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Precision score contains NaN values.") return np.nan @@ -384,6 +384,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) 
y_true = data[self.y_true] @@ -394,7 +395,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Recall score contains NaN values.") return np.nan @@ -472,6 +472,7 @@ def _calculate(self, data: pd.DataFrame): ) _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) labels = sorted(list(self.y_pred_proba.keys())) y_true = data[self.y_true] @@ -482,7 +483,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric {self.display_name}: prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -557,6 +557,7 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): _list_missing([self.y_true, self.y_pred], data) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] @@ -566,7 +567,6 @@ def _calculate(self, data: pd.DataFrame): f"could not calculate metric '{self.display_name}': " "prediction column contains no data" ) - y_true, y_pred = _clean_data(y_true, y_pred) if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan diff --git a/nannyml/performance_calculation/metrics/regression.py b/nannyml/performance_calculation/metrics/regression.py index 8b0c65399..31e9ade27 100644 --- a/nannyml/performance_calculation/metrics/regression.py +++ b/nannyml/performance_calculation/metrics/regression.py @@ -13,7 +13,7 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _clean_data, _list_missing, _raise_exception_for_negative_values +from nannyml.base import _remove_nans, _list_missing, _raise_exception_for_negative_values from nannyml.performance_calculation.metrics.base import Metric, MetricFactory from nannyml.sampling_error.regression import ( mae_sampling_error, @@ -76,11 +76,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -134,11 +134,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -192,11 +192,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -250,11 +250,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, 
data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -313,11 +313,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan @@ -371,11 +371,11 @@ def _fit(self, reference_data: pd.DataFrame): def _calculate(self, data: pd.DataFrame): """Redefine to handle NaNs and edge cases.""" _list_missing([self.y_true, self.y_pred], list(data.columns)) + data = _remove_nans(data, (self.y_true, self.y_pred)) y_true = data[self.y_true] y_pred = data[self.y_pred] - y_true, y_pred = _clean_data(y_true, y_pred) if y_true.empty or y_pred.empty: return np.nan From de1677227dce6c74671f31b9882f860843b0c837 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Sun, 19 Nov 2023 23:14:23 +0100 Subject: [PATCH 5/7] Remove errors and use warning behaviour instead The performance calculator for binary classification had checks in place to raise an exception if the prediction column contains nothing but `NaN`'s. This behaviour contradicts the warning functionality in those same functions, which should return `NaN` and issue a warning instead. It is also inconsistent with the other calculators, which issue a warning rather than raising an error. This commit removes the errors and relies on the existing warning functionality.
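For illustration (a sketch with hypothetical data, assuming a fitted binary classification metric), a
chunk whose prediction column holds nothing but `NaN`'s now degrades gracefully instead of raising:

    data = pd.DataFrame({'y_true': [0, 1, 1], 'y_pred': [np.nan] * 3})
    metric._calculate(data)
    # before: raised InvalidArgumentsException("could not calculate metric ...")
    # after:  issues a warning such as "Calculated ... contains NaN values." and returns np.nan,
    #         since _remove_nans drops every row whose prediction is NaN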
--- .../metrics/binary_classification.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/nannyml/performance_calculation/metrics/binary_classification.py b/nannyml/performance_calculation/metrics/binary_classification.py index e41ec382b..f39ae5392 100644 --- a/nannyml/performance_calculation/metrics/binary_classification.py +++ b/nannyml/performance_calculation/metrics/binary_classification.py @@ -367,11 +367,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric {self.display_name}: " "prediction column contains no data" - ) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Specificity score contains NaN values.") return np.nan @@ -440,11 +435,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.display_name}': " "prediction column contains no data" - ) - if (y_true.nunique() <= 1) or (y_pred.nunique() <= 1): warnings.warn("Calculated Accuracy score contains NaN values.") return np.nan @@ -546,14 +536,6 @@ def _calculate(self, data: pd.DataFrame): y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - f"could not calculate metric '{self.name}': " "prediction column contains no data" - ) - - if y_true is None: - warnings.warn("Calculated Business Value contains NaN values.") - return np.NaN if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN @@ -742,11 +724,6 @@ def _calculate_true_positives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_positive. prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated true_positives contain NaN values.") return np.nan @@ -771,11 +748,6 @@ def _calculate_true_negatives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric true_negative. prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated true_negatives contain NaN values.") return np.nan @@ -800,10 +772,6 @@ def _calculate_false_positives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_positive. prediction column contains no data" - ) if y_true.empty or y_pred.empty: warnings.warn("Calculated false_positives contain NaN values.") return np.nan @@ -828,11 +796,6 @@ def _calculate_false_negatives(self, data: pd.DataFrame) -> float: y_true = data[self.y_true] y_pred = data[self.y_pred] - if y_pred.isna().all(): - raise InvalidArgumentsException( - "could not calculate metric false_negative. 
prediction column contains no data" - ) - if y_true.empty or y_pred.empty: warnings.warn("Calculated false_negatives contain NaN values.") return np.nan From 71314191d26750ccad6e54a978360499b58f9a15 Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Mon, 20 Nov 2023 00:03:16 +0100 Subject: [PATCH 6/7] Refactor more data cleaning methods --- .../confidence_based/metrics.py | 28 ++++--------------- .../direct_loss_estimation/metrics.py | 16 ++++------- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/nannyml/performance_estimation/confidence_based/metrics.py b/nannyml/performance_estimation/confidence_based/metrics.py index 4899d8249..08e8a242f 100644 --- a/nannyml/performance_estimation/confidence_based/metrics.py +++ b/nannyml/performance_estimation/confidence_based/metrics.py @@ -31,6 +31,7 @@ import nannyml.sampling_error.binary_classification as bse import nannyml.sampling_error.multiclass_classification as mse from nannyml._typing import ModelOutputsType, ProblemType, class_labels +from nannyml.base import _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import CalculatorException, InvalidArgumentsException from nannyml.performance_estimation.confidence_based import SUPPORTED_METRIC_VALUES @@ -234,30 +235,13 @@ def _common_cleaning( ) y_pred_proba_column_name = self.y_pred_proba - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() - - y_pred_proba = data[y_pred_proba_column_name] - y_pred = data[self.y_pred] - y_true = data[self.y_true] if clean_targets else None - - # Create mask to filter out NaN values - mask = ~(y_pred.isna() | y_pred_proba.isna()) - if clean_targets: - mask = mask | ~(y_true.isna()) + data = _remove_nans(data, [self.y_pred, y_pred_proba_column_name]) - # Drop missing values (NaN/None) - y_pred_proba = y_pred_proba[mask] - y_pred = y_pred[mask] - if clean_targets: - y_true = y_true[mask] - - # NaN values have been dropped. Try to infer types again - y_pred_proba = y_pred_proba.infer_objects() - y_pred = y_pred.infer_objects() + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = y_true.infer_objects() + data = _remove_nans(data, [self.y_true]) - return y_pred_proba, y_pred, y_true + return data[y_pred_proba_column_name], data[self.y_pred], (data[self.y_true] if clean_targets else None) def get_chunk_record(self, chunk_data: pd.DataFrame) -> Dict: """Returns a dictionary containing the performance metrics for a given chunk. 
@@ -1584,7 +1568,7 @@ def _realized_performance(self, data: pd.DataFrame) -> float: if y_true.shape[0] == 0: warnings.warn("Calculated Business Value contains NaN values.") return np.NaN - + tp_value = self.business_value_matrix[1, 1] tn_value = self.business_value_matrix[0, 0] fp_value = self.business_value_matrix[0, 1] diff --git a/nannyml/performance_estimation/direct_loss_estimation/metrics.py b/nannyml/performance_estimation/direct_loss_estimation/metrics.py index 2daf6b99c..03e35a654 100644 --- a/nannyml/performance_estimation/direct_loss_estimation/metrics.py +++ b/nannyml/performance_estimation/direct_loss_estimation/metrics.py @@ -29,7 +29,7 @@ ) from nannyml._typing import ProblemType -from nannyml.base import _raise_exception_for_negative_values +from nannyml.base import _raise_exception_for_negative_values, _remove_nans from nannyml.chunk import Chunk, Chunker from nannyml.exceptions import InvalidArgumentsException from nannyml.sampling_error.regression import ( @@ -271,18 +271,14 @@ def __eq__(self, other): """Establishes equality by comparing all properties.""" return self.display_name == other.display_name and self.column_name == other.column_name - def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, pd.Series]: - clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() + def _common_cleaning(self, data: pd.DataFrame) -> Tuple[pd.Series, Optional[pd.Series]]: + data = _remove_nans(data, [self.y_pred]) - y_pred = data[self.y_pred] + clean_targets = self.y_true in data.columns and not data[self.y_true].isna().all() if clean_targets: - y_true = data[self.y_true] - y_pred = y_pred[~y_true.isna()] - y_true.dropna(inplace=True) - else: - y_true = None + data = _remove_nans(data, [self.y_pred, self.y_true]) - return y_pred, y_true + return data[self.y_pred], (data[self.y_true] if clean_targets else None) def _train_direct_error_estimation_model( self, From 59ba73ce8e26e322e349a451612191787da4d684 Mon Sep 17 00:00:00 2001 From: Niels Nuyttens Date: Mon, 20 Nov 2023 11:57:53 +0100 Subject: [PATCH 7/7] Deal with mypy overload issue --- nannyml/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nannyml/base.py b/nannyml/base.py index a7958ce3f..953354dc9 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -534,7 +534,7 @@ def _column_is_categorical(column: pd.Series) -> bool: @overload -def _remove_nans(data: pd.Series, columns: None) -> pd.Series: +def _remove_nans(data: pd.Series) -> pd.Series: ... @overload