Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Statistics submodule #390

Merged
merged 15 commits into from
May 27, 2024
3 changes: 0 additions & 3 deletions nannyml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@ def calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.debug(f"calculating {str(self)}")
data = data.copy()
return self._calculate(data, *args, **kwargs)
except NannyMLException:
raise
Expand Down Expand Up @@ -494,7 +493,6 @@ def fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
"""Trains the calculator using reference data."""
try:
self._logger.info(f"fitting {str(self)}")
reference_data = reference_data.copy()
return self._fit(reference_data, *args, **kwargs)
except NannyMLException:
raise
Expand All @@ -505,7 +503,6 @@ def estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.info(f"estimating {str(self)}")
data = data.copy()
return self._estimate(data, *args, **kwargs)
except NannyMLException:
raise
Expand Down
2 changes: 1 addition & 1 deletion nannyml/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class NoopCalibrator(Calibrator):

def fit(self, y_pred_proba: np.ndarray, y_true: np.ndarray, *args, **kwargs):
    """Fit nothing and just return the calibrator.

    Parameters
    ----------
    y_pred_proba : np.ndarray
        Predicted probabilities; ignored by this no-op calibrator.
    y_true : np.ndarray
        Ground-truth labels; ignored by this no-op calibrator.

    Returns
    -------
    NoopCalibrator
        The calibrator itself, so calls can be chained fluently
        (``calibrator.fit(...).calibrate(...)``), as the docstring promises.
        A bare ``pass`` here would return ``None`` and break that contract.
    """
    return self

def calibrate(self, y_pred_proba: np.ndarray, *args, **kwargs):
"""Calibrate nothing and just return the original ``y_pred_proba`` inputs."""
Expand Down
5 changes: 2 additions & 3 deletions nannyml/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_n

def _split(self, data: pd.DataFrame) -> List[Chunk]:
def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk_data = data.loc[index : index + chunk_size - 1, :]
chunk_data = data.iloc[index : index + chunk_size]
chunk = Chunk(
key=f'[{index}:{index + chunk_size - 1}]',
data=chunk_data,
Expand All @@ -388,10 +388,9 @@ def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk.end_datetime = pd.to_datetime(chunk.data[self.timestamp_column_name].max())
return chunk

data = data.copy().reset_index(drop=True)
chunks = [
_create_chunk(index=i, data=data, chunk_size=self.chunk_size)
for i in range(0, len(data), self.chunk_size)
for i in range(0, data.shape[0], self.chunk_size)
if i + self.chunk_size - 1 < len(data)
]

Expand Down
4 changes: 4 additions & 0 deletions nannyml/performance_estimation/confidence_based/cbpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> CBPE:
estimator: PerformanceEstimator
The fitted estimator.
"""
reference_data = reference_data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
return self._fit_binary(reference_data)
elif self.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
Expand Down Expand Up @@ -352,6 +354,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
required_cols = [self.y_pred_proba]
if self.y_pred is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

reference_data = reference_data.copy(deep=True)

_list_missing([self.y_true, self.y_pred], list(reference_data.columns))

_, categorical_feature_columns = _split_features_by_type(reference_data, self.feature_column_names)
Expand Down Expand Up @@ -318,6 +320,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

_list_missing([self.y_pred], list(data.columns))

_, categorical_feature_columns = _split_features_by_type(data, self.feature_column_names)
Expand Down
56 changes: 30 additions & 26 deletions nannyml/stats/avg/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# License: Apache Software License 2.0

"""Simple Statistics Average Calculator"""
"""Simple Statistics Average Calculator."""

from typing import Any, Dict, List, Optional, Union

Expand All @@ -15,13 +15,12 @@
from nannyml.exceptions import InvalidArgumentsException
from nannyml.sampling_error import SAMPLING_ERROR_RANGE
from nannyml.stats.avg.result import Result
from nannyml.stats.base import _add_alert_flag
from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values
from nannyml.usage_logging import UsageEvent, log_usage


class SummaryStatsAvgCalculator(AbstractCalculator):
"""SummaryStatsAvgCalculator implementation"""
"""SummaryStatsAvgCalculator implementation."""

def __init__(
self,
Expand Down Expand Up @@ -118,20 +117,6 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
for col in self.column_names:
self._sampling_error_components[col] = reference_data[col].std()

for column in self.column_names:
reference_chunk_results = np.asarray(
[_calculate_avg_value_stats(chunk.data[column]) for chunk in self.chunker.split(reference_data)]
)
self._lower_alert_thresholds[column], self._upper_alert_thresholds[column] = calculate_threshold_values(
threshold=self.threshold,
data=reference_chunk_results,
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
logger=self._logger,
metric_name=self.simple_stats_metric,
override_using_none=True,
)

self.result = self._calculate(data=reference_data)
self.result.data[('chunk', 'period')] = 'reference'

Expand Down Expand Up @@ -173,6 +158,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
res = res.reset_index(drop=True)

if self.result is None:
self._set_thresholds(results=res)
res = self._populate_thresholds(results=res)
self.result = Result(
results_data=res,
column_names=self.column_names,
Expand All @@ -186,6 +173,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
# but this causes us to lose the "common behavior" in the top level 'filter' method when overriding.
# Applicable here but to many of the base classes as well (e.g. fitting and calculating)
self.result = self.result.filter(period='reference')
res = self._populate_thresholds(results=res)
self.result.data = pd.concat([self.result.data, res]).reset_index(drop=True)

return self.result
Expand All @@ -198,9 +186,6 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = self._sampling_error_components[column_name] / np.sqrt(data.shape[0])
result['upper_confidence_boundary'] = result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error']
result['lower_confidence_boundary'] = result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error']
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = _add_alert_flag(result)
except Exception as exc:
if self._logger:
self._logger.error(
Expand All @@ -210,12 +195,34 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = np.NaN
result['upper_confidence_boundary'] = np.NaN
result['lower_confidence_boundary'] = np.NaN
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = np.NaN
finally:
return result

def _set_thresholds(self, results: pd.DataFrame):
    """Derive and cache per-column alert thresholds from reference chunk averages.

    For every monitored column, the chunk-level ``value`` series from the
    reference results is fed to the configured threshold strategy; the
    resulting lower/upper bounds are stored on the calculator for later use
    when populating analysis results.
    """
    for column_name in self.column_names:
        lower_value, upper_value = calculate_threshold_values(
            threshold=self.threshold,
            data=results[(column_name, 'value')].to_numpy(),
            lower_threshold_value_limit=self.lower_threshold_value_limit,
            upper_threshold_value_limit=self.upper_threshold_value_limit,
            override_using_none=True,
            logger=self._logger,
            metric_name=column_name,
        )
        self._lower_alert_thresholds[column_name] = lower_value
        self._upper_alert_thresholds[column_name] = upper_value

def _populate_thresholds(self, results: pd.DataFrame):
for column in self.column_names:
results[(column, 'upper_threshold')] = self._upper_alert_thresholds[column]
results[(column, 'lower_threshold')] = self._lower_alert_thresholds[column]

lower_threshold = float('-inf') if self._lower_alert_thresholds[column] is None else self._lower_alert_thresholds[column] # noqa: E501
upper_threshold = float('inf') if self._upper_alert_thresholds[column] is None else self._upper_alert_thresholds[column] # noqa: E501
results[(column, 'alert')] = results.apply(
lambda row: not (lower_threshold < row[(column, 'value')] < upper_threshold),
axis=1,
)
return results


def _create_multilevel_index(
column_names,
Expand All @@ -230,9 +237,6 @@ def _create_multilevel_index(
'sampling_error',
'upper_confidence_boundary',
'lower_confidence_boundary',
'upper_threshold',
'lower_threshold',
'alert',
]
]
tuples = chunk_tuples + column_tuples
Expand Down
9 changes: 3 additions & 6 deletions nannyml/stats/avg/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from nannyml.base import PerColumnResult
from nannyml.chunk import Chunker

# from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots.blueprints.comparisons import ResultCompareMixin
from nannyml.plots.blueprints.metrics import plot_metrics
from nannyml.usage_logging import UsageEvent, log_usage
Expand All @@ -36,13 +35,15 @@ def __init__(
timestamp_column_name: Optional[str],
chunker: Chunker,
):
"""Initalize results class."""
super().__init__(results_data, column_names)

self.timestamp_column_name = timestamp_column_name
self.simple_stats_metric = simple_stats_metric
self.chunker = chunker

def keys(self) -> List[Key]:
"""Get Keys."""
return [
Key(
properties=(column_name,),
Expand All @@ -57,10 +58,7 @@ def plot(
*args,
**kwargs,
) -> go.Figure:
"""

Parameters
----------
"""Plot results.

Returns
-------
Expand All @@ -84,7 +82,6 @@ def plot(
... res = res.filter(period='analysis', column_name=column_name).plot().show()

"""

return plot_metrics(
self,
title='Averaged Values ',
Expand Down
21 changes: 0 additions & 21 deletions nannyml/stats/base.py

This file was deleted.

51 changes: 29 additions & 22 deletions nannyml/stats/count/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,23 @@
#
# License: Apache Software License 2.0

"""Simple Statistics Average Calculator"""
"""Simple Statistics Average Calculator."""

from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
from pandas import MultiIndex

from nannyml.base import AbstractCalculator
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException
from nannyml.stats.base import _add_alert_flag
from nannyml.stats.count.result import Result
from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values
from nannyml.usage_logging import UsageEvent, log_usage


class SummaryStatsRowCountCalculator(AbstractCalculator):
"""SummaryStatsRowCountCalculator implementation"""
"""SummaryStatsRowCountCalculator implementation."""

def __init__(
self,
Expand Down Expand Up @@ -69,7 +67,6 @@ def __init__(

self.result: Optional[Result] = None
# No sampling error
# self._sampling_error_components: Dict[str, float] = {column_name: 0 for column_name in self.column_names}
# threshold strategy is the same across all columns
self.threshold = threshold
self._upper_alert_threshold: Optional[float] = 0
Expand All @@ -90,19 +87,6 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

reference_chunk_results = np.asarray(
[self._calculate_count_value_stats(chunk.data) for chunk in self.chunker.split(reference_data)]
)
self._lower_alert_threshold, self._upper_alert_threshold = calculate_threshold_values(
threshold=self.threshold,
data=reference_chunk_results,
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
logger=self._logger,
metric_name=self.simple_stats_metric,
override_using_none=True,
)

self.result = self._calculate(data=reference_data)
self.result.data[('chunk', 'period')] = 'reference'

Expand Down Expand Up @@ -141,6 +125,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
res = res.reset_index(drop=True)

if self.result is None:
self._set_thresholds(results=res)
res = self._populate_thresholds(results=res)
self.result = Result(
results_data=res,
simple_stats_metric=self.simple_stats_metric,
Expand All @@ -153,6 +139,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
# but this causes us to lose the "common behavior" in the top level 'filter' method when overriding.
# Applicable here but to many of the base classes as well (e.g. fitting and calculating)
self.result = self.result.filter(period='reference')
res = self._populate_thresholds(results=res)
self.result.data = pd.concat([self.result.data, res]).reset_index(drop=True)

return self.result
Expand All @@ -161,17 +148,37 @@ def _calculate_for_df(self, data: pd.DataFrame) -> Dict[str, Any]:
result = {}
value = self._calculate_count_value_stats(data)
result['value'] = value
result['upper_threshold'] = self._upper_alert_threshold
result['lower_threshold'] = self._lower_alert_threshold
result['alert'] = _add_alert_flag(result)
return result

def _set_thresholds(self, results: pd.DataFrame):
    """Derive and cache the row-count alert thresholds from reference chunk counts.

    Feeds the chunk-level ``value`` series of the single count metric to the
    configured threshold strategy and stores the resulting lower/upper bounds
    on the calculator.
    """
    reference_values = results[(self.simple_stats_metric, 'value')].to_numpy()
    lower_value, upper_value = calculate_threshold_values(
        threshold=self.threshold,
        data=reference_values,
        lower_threshold_value_limit=self.lower_threshold_value_limit,
        upper_threshold_value_limit=self.upper_threshold_value_limit,
        override_using_none=True,
        logger=self._logger,
        metric_name=self.simple_stats_metric,
    )
    self._lower_alert_threshold = lower_value
    self._upper_alert_threshold = upper_value

def _populate_thresholds(self, results: pd.DataFrame):
results[(self.simple_stats_metric, 'upper_threshold')] = self._upper_alert_threshold
results[(self.simple_stats_metric, 'lower_threshold')] = self._lower_alert_threshold

lower_threshold = float('-inf') if self._lower_alert_threshold is None else self._lower_alert_threshold
upper_threshold = float('inf') if self._upper_alert_threshold is None else self._upper_alert_threshold
results[(self.simple_stats_metric, 'alert')] = results.apply(
lambda row: not (lower_threshold < row[(self.simple_stats_metric, 'value')] < upper_threshold),
axis=1,
)
return results


def _create_multilevel_index(
column0,
):
chunk_column_names = ['key', 'chunk_index', 'start_index', 'end_index', 'start_date', 'end_date', 'period']
chunk_tuples = [('chunk', chunk_column_name) for chunk_column_name in chunk_column_names]
count_tuples = [(column0, el) for el in ['value', 'upper_threshold', 'lower_threshold', 'alert']]
count_tuples = [(column0, el) for el in ['value', ]]
tuples = chunk_tuples + count_tuples
return MultiIndex.from_tuples(tuples)
Loading
Loading