From bfa80ff762a3ec514307f981a2ce08d477c5e637 Mon Sep 17 00:00:00 2001
From: Daniel Huppmann <dh@dergelbesalon.at>
Date: Fri, 20 Dec 2024 13:44:30 +0100
Subject: [PATCH] Fix wildcard aggregation by skipping (#448)

* Add external repo cleanup

* Return variable name string directly

* Update usage of vars_default_args

* Add a simple test

* Implement skipping region-aggregation for wildcard variables

* Require that wildcard-variables have explicit skip-region-aggregation

* Make ruff

* Add region aggregation for wildcard aggregation test

* Fix pydantic validator

---------

Co-authored-by: Philip Hackstock <20710924+phackstock@users.noreply.github.com>
---
 nomenclature/code.py                          | 24 +++++++--
 nomenclature/codelist.py                      | 15 +++---
 nomenclature/definition.py                    |  3 ++
 nomenclature/processor/region.py              |  2 +-
 .../skip_aggregation/mappings/model_a.yaml    |  2 +-
 .../dsd/region/regions.yaml                   |  5 ++
 .../dsd/variable/variables.yaml               |  4 ++
 .../mappings/model_a.yaml                     |  8 +++
 .../mappings/model_b.yaml                     |  8 +++
 tests/test_code.py                            |  6 +++
 tests/test_codelist.py                        | 13 +++--
 tests/test_core.py                            | 49 ++++++++++++++++++-
 tests/test_validation.py                      |  6 +--
 13 files changed, 124 insertions(+), 21 deletions(-)
 create mode 100644 tests/data/region_processing/wildcard_skip_aggregation/dsd/region/regions.yaml
 create mode 100644 tests/data/region_processing/wildcard_skip_aggregation/dsd/variable/variables.yaml
 create mode 100644 tests/data/region_processing/wildcard_skip_aggregation/mappings/model_a.yaml
 create mode 100644 tests/data/region_processing/wildcard_skip_aggregation/mappings/model_b.yaml

diff --git a/nomenclature/code.py b/nomenclature/code.py
index 036ad86d..415b308f 100644
--- a/nomenclature/code.py
+++ b/nomenclature/code.py
@@ -3,19 +3,21 @@
 from keyword import iskeyword
 from pathlib import Path
 from typing import Any
+
+from pyam.utils import to_list
 from pydantic import (
-    field_validator,
-    field_serializer,
-    ConfigDict,
     BaseModel,
+    ConfigDict,
     Field,
     ValidationInfo,
+    field_serializer,
+    field_validator,
+    model_validator,
 )
+from typing_extensions import Self
 
 from nomenclature.error import ErrorCollector
 
-from pyam.utils import to_list
-
 from .countries import countries
 
 
@@ -208,6 +210,14 @@ def deserialize_json(cls, v):
     def convert_none_to_empty_string(cls, v):
         return v if v is not None else ""
 
+    @model_validator(mode="after")
+    def wildcard_must_skip_region_aggregation(self) -> Self:
+        if self.is_wildcard and self.skip_region_aggregation is False:
+            raise ValueError(
+                f"Wildcard variable '{self.name}' must skip region aggregation"
+            )
+        return self
+
     @field_validator("components", mode="before")
     def cast_variable_components_args(cls, v):
         """Cast "components" list of dicts to a codelist"""
@@ -224,6 +234,10 @@ def cast_variable_components_args(cls, v):
     def convert_str_to_none_for_writing(self, v):
         return v if v != "" else None
 
+    @property
+    def is_wildcard(self) -> bool:
+        return "*" in self.name
+
     @property
     def units(self) -> list[str]:
         return self.unit if isinstance(self.unit, list) else [self.unit]
diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py
index d9cb7039..329158e5 100644
--- a/nomenclature/codelist.py
+++ b/nomenclature/codelist.py
@@ -595,13 +595,15 @@ def check_weight_in_vars(cls, v):
             )
         return v
 
-    def vars_default_args(self, variables: list[str]) -> list[VariableCode]:
+    def vars_default_args(self, variables: list[str]) -> list[str]:
         """return subset of variables which does not feature any special pyam
         aggregation arguments and where skip_region_aggregation is False"""
         return [
-            self[var]
+            var
             for var in variables
-            if not self[var].agg_kwargs and not self[var].skip_region_aggregation
+            if var in self.keys()
+            and not self[var].agg_kwargs
+            and not self[var].skip_region_aggregation
         ]
 
     def vars_kwargs(self, variables: list[str]) -> list[VariableCode]:
@@ -610,7 +612,9 @@ def vars_kwargs(self, variables: list[str]) -> list[VariableCode]:
         return [
             self[var]
             for var in variables
-            if self[var].agg_kwargs and not self[var].skip_region_aggregation
+            if var in self.keys()
+            and self[var].agg_kwargs
+            and not self[var].skip_region_aggregation
         ]
 
     def validate_units(
@@ -621,8 +625,7 @@ def validate_units(
         if invalid_units := [
             (variable, unit, self.mapping[variable].unit)
             for variable, unit in unit_mapping.items()
-            if variable in self.variables
-            and unit not in self.mapping[variable].units
+            if variable in self.variables and unit not in self.mapping[variable].units
         ]:
             lst = [
                 f"'{v}' - expected: {'one of ' if isinstance(e, list) else ''}"
diff --git a/nomenclature/definition.py b/nomenclature/definition.py
index 9f3e639e..f0b1c2c4 100644
--- a/nomenclature/definition.py
+++ b/nomenclature/definition.py
@@ -139,6 +139,9 @@ def check_aggregate(self, df: IamDataFrame, **kwargs) -> None:
 
         with adjust_log_level(level="WARNING"):
             for code in df.variable:
+                if code not in self.variable.mapping:
+                    continue
+
                 attr = self.variable.mapping[code]
                 if attr.check_aggregate:
                     components = attr.components
diff --git a/nomenclature/processor/region.py b/nomenclature/processor/region.py
index 06b56ec9..976223d6 100644
--- a/nomenclature/processor/region.py
+++ b/nomenclature/processor/region.py
@@ -646,7 +646,7 @@ def _apply_region_processing(
 
                     # first, perform 'simple' aggregation (no arguments)
                     simple_vars = [
-                        var.name
+                        var
                         for var in self.variable_codelist.vars_default_args(
                             model_df.variable
                         )
diff --git a/tests/data/region_processing/skip_aggregation/mappings/model_a.yaml b/tests/data/region_processing/skip_aggregation/mappings/model_a.yaml
index d7e4267e..9c22510d 100644
--- a/tests/data/region_processing/skip_aggregation/mappings/model_a.yaml
+++ b/tests/data/region_processing/skip_aggregation/mappings/model_a.yaml
@@ -1,4 +1,4 @@
-model: m_a
+model: model_a
 native_regions:
   - region_A
   - region_B
diff --git a/tests/data/region_processing/wildcard_skip_aggregation/dsd/region/regions.yaml b/tests/data/region_processing/wildcard_skip_aggregation/dsd/region/regions.yaml
new file mode 100644
index 00000000..24a2ed58
--- /dev/null
+++ b/tests/data/region_processing/wildcard_skip_aggregation/dsd/region/regions.yaml
@@ -0,0 +1,5 @@
+- common:
+  - World
+- model_native:
+  - region_A
+  - region_B
diff --git a/tests/data/region_processing/wildcard_skip_aggregation/dsd/variable/variables.yaml b/tests/data/region_processing/wildcard_skip_aggregation/dsd/variable/variables.yaml
new file mode 100644
index 00000000..4f8428db
--- /dev/null
+++ b/tests/data/region_processing/wildcard_skip_aggregation/dsd/variable/variables.yaml
@@ -0,0 +1,4 @@
+- Capital Cost|Electricity|*:
+    definition: Capital cost of electricity generation for a specific technology
+    unit: USD_2010/kW
+    skip-region-aggregation: true
diff --git a/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_a.yaml b/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_a.yaml
new file mode 100644
index 00000000..9c22510d
--- /dev/null
+++ b/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_a.yaml
@@ -0,0 +1,8 @@
+model: model_a
+native_regions:
+  - region_A
+  - region_B
+common_regions:
+  - World:
+    - region_A
+    - region_B
diff --git a/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_b.yaml b/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_b.yaml
new file mode 100644
index 00000000..77c1f9c3
--- /dev/null
+++ b/tests/data/region_processing/wildcard_skip_aggregation/mappings/model_b.yaml
@@ -0,0 +1,8 @@
+model: model_b
+native_regions:
+  - region_A
+  - region_b: region_B
+common_regions:
+  - World:
+    - region_A
+    - region_b
diff --git a/tests/test_code.py b/tests/test_code.py
index 25380b76..9db67fcd 100644
--- a/tests/test_code.py
+++ b/tests/test_code.py
@@ -37,6 +37,12 @@ def test_variable_multiple_units():
     assert var.unit == ["unit1", "unit2"]
 
 
+def test_variable_wildcard_skip_region_aggregation_required():
+    """Test that a VariableCode with wildcard must have skip_region_aggregation: True"""
+    with raises(ValueError, match="Wildcard variable 'Var1\*' must skip region"):
+        VariableCode.from_dict({"Var1*": {"unit": "unit1"}})
+
+
 @pytest.mark.parametrize("unit", ["Test unit", ["Test unit 1", "Test unit 2"]])
 def test_set_attributes_with_json(unit):
     var = VariableCode(
diff --git a/tests/test_codelist.py b/tests/test_codelist.py
index 0a833c02..552161f2 100644
--- a/tests/test_codelist.py
+++ b/tests/test_codelist.py
@@ -298,9 +298,16 @@ def test_illegal_char_ignores_external_repo():
     """Check that external repos are excluded from this check."""
     # the config includes illegal characters known to be in common-definitions
     # the test will not raise errors as the check is skipped for external repos
-    DataStructureDefinition(
-        MODULE_TEST_DATA_DIR / "illegal_chars" / "char_in_external_repo" / "definitions"
-    )
+
+    try:
+        dsd = DataStructureDefinition(
+            MODULE_TEST_DATA_DIR
+            / "illegal_chars"
+            / "char_in_external_repo"
+            / "definitions"
+        )
+    finally:
+        clean_up_external_repos(dsd.config.repositories)
 
 
 def test_end_whitespace_fails():
diff --git a/tests/test_core.py b/tests/test_core.py
index ad7abaa8..ceb90093 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -259,8 +259,8 @@ def test_region_processing_weighted_aggregation(folder, exp_df, args, caplog):
 )
 def test_region_processing_skip_aggregation(model_name, region_names):
     # Testing two cases:
-    # * model "m_a" renames native regions and the world region is skipped
-    # * model "m_b" renames single constituent common regions
+    # * model "model_a" renames native regions and the world region is skipped
+    # * model "model_b" renames single constituent common regions
 
     test_df = IamDataFrame(
         pd.DataFrame(
@@ -296,6 +296,51 @@ def test_region_processing_skip_aggregation(model_name, region_names):
     assert_iamframe_equal(obs, exp)
 
 
+@pytest.mark.parametrize(
+    "model_name, region_names",
+    [("model_a", ("region_A", "region_B")), ("model_b", ("region_A", "region_b"))],
+)
+def test_region_processing_wildcard_skip_aggregation(model_name, region_names):
+    # Testing two cases:
+    # * model "model_a" keeps native regions as they are
+    # * model "model_b" renames one native region
+
+    variable = "Capital Cost|Electricity|Solar PV"
+    unit = "USD_2010/kW"
+    test_df = IamDataFrame(
+        pd.DataFrame(
+            [
+                [model_name, "s_a", region_names[0], variable, unit, 1, 2],
+                [model_name, "s_a", region_names[1], variable, unit, 3, 4],
+            ],
+            columns=IAMC_IDX + [2005, 2010],
+        )
+    )
+    add_meta(test_df)
+
+    exp = IamDataFrame(
+        pd.DataFrame(
+            [
+                [model_name, "s_a", "region_A", variable, unit, 1, 2],
+                [model_name, "s_a", "region_B", variable, unit, 3, 4],
+            ],
+            columns=IAMC_IDX + [2005, 2010],
+        )
+    )
+    add_meta(exp)
+
+    obs = process(
+        test_df,
+        dsd := DataStructureDefinition(
+            TEST_DATA_DIR / "region_processing/wildcard_skip_aggregation/dsd"
+        ),
+        processor=RegionProcessor.from_directory(
+            TEST_DATA_DIR / "region_processing/wildcard_skip_aggregation/mappings", dsd
+        ),
+    )
+    assert_iamframe_equal(obs, exp)
+
+
 @pytest.mark.parametrize(
     "input_data, exp_data, warning",
     [
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 51b9acf3..e104db38 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -70,9 +70,9 @@ def test_validation_fails_region(simple_definition, simple_df, caplog):
 def test_validation_multiple_units(extras_definition, simple_df):
     """Validating against a VariableCode with multiple units works as expected"""
     extras_definition.validate(
-        simple_df
-        .filter(variable="Primary Energy|Coal")
-        .rename(unit={"EJ/yr": "GWh/yr"})
+        simple_df.filter(variable="Primary Energy|Coal").rename(
+            unit={"EJ/yr": "GWh/yr"}
+        )
     )