From 246ca356bc5127636001e54745eef032acf12ecb Mon Sep 17 00:00:00 2001 From: Michael Van de Steene Date: Fri, 24 May 2024 21:58:57 +0200 Subject: [PATCH] Optimize filtering univariate result for period The `PerMetricPerColumnResult` filter function has a high overhead for selecting a subset of columns or metrics. This overhead is also incurred (and highest) when only filtering for period, as then all columns & metrics will be selected. This commit adds a short-circuit path to avoid the overhead when only the period requires filtering. For a result with 50 columns and 8 metrics this results in a >100x speed-up when only filtering for period. --- nannyml/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nannyml/base.py b/nannyml/base.py index 9cc61326..a0ec7062 100644 --- a/nannyml/base.py +++ b/nannyml/base.py @@ -293,13 +293,15 @@ def _filter( *args, **kwargs, ) -> Self: + res = super()._filter(period, *args, **kwargs) + if metrics is None and column_names is None: + return res + if metrics is None: metrics = [metric.column_name for metric in self.metrics] if column_names is None: column_names = self.column_names - res = super()._filter(period, *args, **kwargs) - data = pd.concat([res.data.loc[:, (['chunk'])], res.data.loc[:, (column_names, metrics)]], axis=1) data = data.reset_index(drop=True)