Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor Statistics submodule #390

Merged
merged 15 commits into from
May 27, 2024
3 changes: 0 additions & 3 deletions nannyml/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@ def calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.debug(f"calculating {str(self)}")
data = data.copy()
return self._calculate(data, *args, **kwargs)
except NannyMLException:
raise
Expand Down Expand Up @@ -494,7 +493,6 @@ def fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
"""Trains the calculator using reference data."""
try:
self._logger.info(f"fitting {str(self)}")
reference_data = reference_data.copy()
return self._fit(reference_data, *args, **kwargs)
except NannyMLException:
raise
Expand All @@ -505,7 +503,6 @@ def estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
"""Performs a calculation on the provided data."""
try:
self._logger.info(f"estimating {str(self)}")
data = data.copy()
return self._estimate(data, *args, **kwargs)
except NannyMLException:
raise
Expand Down
2 changes: 1 addition & 1 deletion nannyml/calibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ class NoopCalibrator(Calibrator):

def fit(self, y_pred_proba: np.ndarray, y_true: np.ndarray, *args, **kwargs):
    """Fit nothing and just return the calibrator.

    Parameters
    ----------
    y_pred_proba : np.ndarray
        Predicted probabilities; ignored by this no-op calibrator.
    y_true : np.ndarray
        Ground-truth labels; ignored by this no-op calibrator.

    Returns
    -------
    NoopCalibrator
        The calibrator itself, so calls can be chained fluently
        (``calibrator.fit(...).calibrate(...)``), as the docstring promises.
        A bare ``pass`` here would return ``None`` and break that contract.
    """
    return self

def calibrate(self, y_pred_proba: np.ndarray, *args, **kwargs):
"""Calibrate nothing and just return the original ``y_pred_proba`` inputs."""
Expand Down
5 changes: 2 additions & 3 deletions nannyml/chunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ def __init__(self, chunk_size: int, incomplete: str = 'keep', timestamp_column_n

def _split(self, data: pd.DataFrame) -> List[Chunk]:
def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk_data = data.loc[index : index + chunk_size - 1, :]
chunk_data = data.iloc[index : index + chunk_size]
chunk = Chunk(
key=f'[{index}:{index + chunk_size - 1}]',
data=chunk_data,
Expand All @@ -388,10 +388,9 @@ def _create_chunk(index: int, data: pd.DataFrame, chunk_size: int) -> Chunk:
chunk.end_datetime = pd.to_datetime(chunk.data[self.timestamp_column_name].max())
return chunk

data = data.copy().reset_index(drop=True)
chunks = [
_create_chunk(index=i, data=data, chunk_size=self.chunk_size)
for i in range(0, len(data), self.chunk_size)
for i in range(0, data.shape[0], self.chunk_size)
if i + self.chunk_size - 1 < len(data)
]

Expand Down
4 changes: 4 additions & 0 deletions nannyml/performance_estimation/confidence_based/cbpe.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> CBPE:
estimator: PerformanceEstimator
The fitted estimator.
"""
reference_data = reference_data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
return self._fit_binary(reference_data)
elif self.problem_type == ProblemType.CLASSIFICATION_MULTICLASS:
Expand Down Expand Up @@ -352,6 +354,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

if self.problem_type == ProblemType.CLASSIFICATION_BINARY:
required_cols = [self.y_pred_proba]
if self.y_pred is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs) -> Self:
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

reference_data = reference_data.copy(deep=True)

_list_missing([self.y_true, self.y_pred], list(reference_data.columns))

_, categorical_feature_columns = _split_features_by_type(reference_data, self.feature_column_names)
Expand Down Expand Up @@ -318,6 +320,8 @@ def _estimate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
if data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

data = data.copy(deep=True)

_list_missing([self.y_pred], list(data.columns))

_, categorical_feature_columns = _split_features_by_type(data, self.feature_column_names)
Expand Down
56 changes: 30 additions & 26 deletions nannyml/stats/avg/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# License: Apache Software License 2.0

"""Simple Statistics Average Calculator"""
"""Simple Statistics Average Calculator."""

from typing import Any, Dict, List, Optional, Union

Expand All @@ -15,13 +15,12 @@
from nannyml.exceptions import InvalidArgumentsException
from nannyml.sampling_error import SAMPLING_ERROR_RANGE
from nannyml.stats.avg.result import Result
from nannyml.stats.base import _add_alert_flag
from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values
from nannyml.usage_logging import UsageEvent, log_usage


class SummaryStatsAvgCalculator(AbstractCalculator):
"""SummaryStatsAvgCalculator implementation"""
"""SummaryStatsAvgCalculator implementation."""

def __init__(
self,
Expand Down Expand Up @@ -118,20 +117,6 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
for col in self.column_names:
self._sampling_error_components[col] = reference_data[col].std()

for column in self.column_names:
reference_chunk_results = np.asarray(
[_calculate_avg_value_stats(chunk.data[column]) for chunk in self.chunker.split(reference_data)]
)
self._lower_alert_thresholds[column], self._upper_alert_thresholds[column] = calculate_threshold_values(
threshold=self.threshold,
data=reference_chunk_results,
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
logger=self._logger,
metric_name=self.simple_stats_metric,
override_using_none=True,
)

self.result = self._calculate(data=reference_data)
self.result.data[('chunk', 'period')] = 'reference'

Expand Down Expand Up @@ -173,6 +158,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
res = res.reset_index(drop=True)

if self.result is None:
self._set_thresholds(results=res)
res = self._populate_thresholds(results=res)
self.result = Result(
results_data=res,
column_names=self.column_names,
Expand All @@ -186,6 +173,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
# but this causes us to lose the "common behavior" in the top level 'filter' method when overriding.
# Applicable here but to many of the base classes as well (e.g. fitting and calculating)
self.result = self.result.filter(period='reference')
res = self._populate_thresholds(results=res)
self.result.data = pd.concat([self.result.data, res]).reset_index(drop=True)

return self.result
Expand All @@ -198,9 +186,6 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = self._sampling_error_components[column_name] / np.sqrt(data.shape[0])
result['upper_confidence_boundary'] = result['value'] + SAMPLING_ERROR_RANGE * result['sampling_error']
result['lower_confidence_boundary'] = result['value'] - SAMPLING_ERROR_RANGE * result['sampling_error']
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = _add_alert_flag(result)
except Exception as exc:
if self._logger:
self._logger.error(
Expand All @@ -210,12 +195,34 @@ def _calculate_for_column(self, data: pd.DataFrame, column_name: str) -> Dict[st
result['sampling_error'] = np.NaN
result['upper_confidence_boundary'] = np.NaN
result['lower_confidence_boundary'] = np.NaN
result['upper_threshold'] = self._upper_alert_thresholds[column_name]
result['lower_threshold'] = self._lower_alert_thresholds[column_name]
result['alert'] = np.NaN
finally:
return result

def _set_thresholds(self, results: pd.DataFrame):
    """Derive and cache per-column alert thresholds from reference chunk averages.

    For every monitored column, the chunk-level ``value`` series from the
    reference results is fed to the configured threshold strategy; the
    resulting lower/upper bounds are stored on the calculator for later use
    when populating analysis results.
    """
    for column_name in self.column_names:
        lower_value, upper_value = calculate_threshold_values(
            threshold=self.threshold,
            data=results[(column_name, 'value')].to_numpy(),
            lower_threshold_value_limit=self.lower_threshold_value_limit,
            upper_threshold_value_limit=self.upper_threshold_value_limit,
            override_using_none=True,
            logger=self._logger,
            metric_name=column_name,
        )
        self._lower_alert_thresholds[column_name] = lower_value
        self._upper_alert_thresholds[column_name] = upper_value

def _populate_thresholds(self, results: pd.DataFrame):
for column in self.column_names:
results[(column, 'upper_threshold')] = self._upper_alert_thresholds[column]
results[(column, 'lower_threshold')] = self._lower_alert_thresholds[column]

lower_threshold = float('-inf') if self._lower_alert_thresholds[column] is None else self._lower_alert_thresholds[column] # noqa: E501
upper_threshold = float('inf') if self._upper_alert_thresholds[column] is None else self._upper_alert_thresholds[column] # noqa: E501
results[(column, 'alert')] = results.apply(
lambda row: not (lower_threshold < row[(column, 'value')] < upper_threshold),
axis=1,
)
return results


def _create_multilevel_index(
column_names,
Expand All @@ -230,9 +237,6 @@ def _create_multilevel_index(
'sampling_error',
'upper_confidence_boundary',
'lower_confidence_boundary',
'upper_threshold',
'lower_threshold',
'alert',
]
]
tuples = chunk_tuples + column_tuples
Expand Down
9 changes: 3 additions & 6 deletions nannyml/stats/avg/result.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from nannyml.base import PerColumnResult
from nannyml.chunk import Chunker

# from nannyml.exceptions import InvalidArgumentsException
from nannyml.plots.blueprints.comparisons import ResultCompareMixin
from nannyml.plots.blueprints.metrics import plot_metrics
from nannyml.usage_logging import UsageEvent, log_usage
Expand All @@ -36,13 +35,15 @@ def __init__(
timestamp_column_name: Optional[str],
chunker: Chunker,
):
"""Initalize results class."""
super().__init__(results_data, column_names)

self.timestamp_column_name = timestamp_column_name
self.simple_stats_metric = simple_stats_metric
self.chunker = chunker

def keys(self) -> List[Key]:
"""Get Keys."""
return [
Key(
properties=(column_name,),
Expand All @@ -57,10 +58,7 @@ def plot(
*args,
**kwargs,
) -> go.Figure:
"""

Parameters
----------
"""Plot results.

Returns
-------
Expand All @@ -84,7 +82,6 @@ def plot(
... res = res.filter(period='analysis', column_name=column_name).plot().show()

"""

return plot_metrics(
self,
title='Averaged Values ',
Expand Down
21 changes: 0 additions & 21 deletions nannyml/stats/base.py

This file was deleted.

51 changes: 29 additions & 22 deletions nannyml/stats/count/calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,23 @@
#
# License: Apache Software License 2.0

"""Simple Statistics Average Calculator"""
"""Simple Statistics Average Calculator."""

from typing import Any, Dict, Optional

import numpy as np
import pandas as pd
from pandas import MultiIndex

from nannyml.base import AbstractCalculator
from nannyml.chunk import Chunker
from nannyml.exceptions import InvalidArgumentsException
from nannyml.stats.base import _add_alert_flag
from nannyml.stats.count.result import Result
from nannyml.thresholds import StandardDeviationThreshold, Threshold, calculate_threshold_values
from nannyml.usage_logging import UsageEvent, log_usage


class SummaryStatsRowCountCalculator(AbstractCalculator):
"""SummaryStatsRowCountCalculator implementation"""
"""SummaryStatsRowCountCalculator implementation."""

def __init__(
self,
Expand Down Expand Up @@ -69,7 +67,6 @@ def __init__(

self.result: Optional[Result] = None
# No sampling error
# self._sampling_error_components: Dict[str, float] = {column_name: 0 for column_name in self.column_names}
# threshold strategy is the same across all columns
self.threshold = threshold
self._upper_alert_threshold: Optional[float] = 0
Expand All @@ -90,19 +87,6 @@ def _fit(self, reference_data: pd.DataFrame, *args, **kwargs):
if reference_data.empty:
raise InvalidArgumentsException('data contains no rows. Please provide a valid data set.')

reference_chunk_results = np.asarray(
[self._calculate_count_value_stats(chunk.data) for chunk in self.chunker.split(reference_data)]
)
self._lower_alert_threshold, self._upper_alert_threshold = calculate_threshold_values(
threshold=self.threshold,
data=reference_chunk_results,
lower_threshold_value_limit=self.lower_threshold_value_limit,
upper_threshold_value_limit=self.upper_threshold_value_limit,
logger=self._logger,
metric_name=self.simple_stats_metric,
override_using_none=True,
)

self.result = self._calculate(data=reference_data)
self.result.data[('chunk', 'period')] = 'reference'

Expand Down Expand Up @@ -141,6 +125,8 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
res = res.reset_index(drop=True)

if self.result is None:
self._set_thresholds(results=res)
res = self._populate_thresholds(results=res)
self.result = Result(
results_data=res,
simple_stats_metric=self.simple_stats_metric,
Expand All @@ -153,6 +139,7 @@ def _calculate(self, data: pd.DataFrame, *args, **kwargs) -> Result:
# but this causes us to lose the "common behavior" in the top level 'filter' method when overriding.
# Applicable here but to many of the base classes as well (e.g. fitting and calculating)
self.result = self.result.filter(period='reference')
res = self._populate_thresholds(results=res)
self.result.data = pd.concat([self.result.data, res]).reset_index(drop=True)

return self.result
Expand All @@ -161,17 +148,37 @@ def _calculate_for_df(self, data: pd.DataFrame) -> Dict[str, Any]:
result = {}
value = self._calculate_count_value_stats(data)
result['value'] = value
result['upper_threshold'] = self._upper_alert_threshold
result['lower_threshold'] = self._lower_alert_threshold
result['alert'] = _add_alert_flag(result)
return result

def _set_thresholds(self, results: pd.DataFrame):
    """Derive and cache the row-count alert thresholds from reference chunk counts.

    Feeds the chunk-level ``value`` series of the single count metric to the
    configured threshold strategy and stores the resulting lower/upper bounds
    on the calculator.
    """
    reference_values = results[(self.simple_stats_metric, 'value')].to_numpy()
    lower_value, upper_value = calculate_threshold_values(
        threshold=self.threshold,
        data=reference_values,
        lower_threshold_value_limit=self.lower_threshold_value_limit,
        upper_threshold_value_limit=self.upper_threshold_value_limit,
        override_using_none=True,
        logger=self._logger,
        metric_name=self.simple_stats_metric,
    )
    self._lower_alert_threshold = lower_value
    self._upper_alert_threshold = upper_value

def _populate_thresholds(self, results: pd.DataFrame):
results[(self.simple_stats_metric, 'upper_threshold')] = self._upper_alert_threshold
results[(self.simple_stats_metric, 'lower_threshold')] = self._lower_alert_threshold

lower_threshold = float('-inf') if self._lower_alert_threshold is None else self._lower_alert_threshold
upper_threshold = float('inf') if self._upper_alert_threshold is None else self._upper_alert_threshold
results[(self.simple_stats_metric, 'alert')] = results.apply(
lambda row: not (lower_threshold < row[(self.simple_stats_metric, 'value')] < upper_threshold),
axis=1,
)
return results


def _create_multilevel_index(
column0,
):
chunk_column_names = ['key', 'chunk_index', 'start_index', 'end_index', 'start_date', 'end_date', 'period']
chunk_tuples = [('chunk', chunk_column_name) for chunk_column_name in chunk_column_names]
count_tuples = [(column0, el) for el in ['value', 'upper_threshold', 'lower_threshold', 'alert']]
count_tuples = [(column0, el) for el in ['value', ]]
tuples = chunk_tuples + count_tuples
return MultiIndex.from_tuples(tuples)
Loading
Loading