fix: avoid stripping whitespaces for feature names (#1368)

* fix: minimal invasive change to avoid stripping whitespaces for feature names

  Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>

* fix: roll back change to work with older and newer xmltodict versions

* add: test for whitespaces in features xml

---------

Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>
openml · Oct 16, 2024 · d0deb6d · d0deb6d
1 parent aa0aca0
commit d0deb6d
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 8 deletions.
diff --git a/openml/datasets/dataset.py b/openml/datasets/dataset.py
@@ -1077,7 +1077,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:
 
 
 def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
-    xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
+    xml_dict = xmltodict.parse(
+        features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
+    )
     features_xml = xml_dict["oml:data_features"]
 
     features: dict[int, OpenMLDataFeature] = {}

diff --git a/openml/study/functions.py b/openml/study/functions.py
@@ -78,7 +78,7 @@ def get_study(
     return study
 
 
-def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
+def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
     xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
     force_list_tags = (
         "oml:data_id",
@@ -93,12 +93,6 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
     alias = result_dict.get("oml:alias", None)
     main_entity_type = result_dict["oml:main_entity_type"]
 
-    # Parses edge cases where the server returns a string with a newline character for empty values.
-    none_value_indicator = "\n      "
-    for key in result_dict:
-        if result_dict[key] == none_value_indicator:
-            result_dict[key] = None
-
     if entity_type != main_entity_type:
         raise ValueError(
             f"Unexpected entity type '{main_entity_type}' reported by the server"

diff --git a/tests/files/misc/features_with_whitespaces.xml b/tests/files/misc/features_with_whitespaces.xml
@@ -0,0 +1,22 @@
+<oml:data_features xmlns:oml="http://openml.org/openml">
+    <oml:feature>
+        <oml:index>0</oml:index>
+        <oml:name>V1</oml:name>
+        <oml:data_type>numeric</oml:data_type>
+            <oml:is_target>false</oml:is_target>
+        <oml:is_ignore>false</oml:is_ignore>
+        <oml:is_row_identifier>false</oml:is_row_identifier>
+        <oml:number_of_missing_values>0</oml:number_of_missing_values>
+    </oml:feature>
+    <oml:feature>
+        <oml:index>1</oml:index>
+        <oml:name>V42</oml:name>
+        <oml:data_type>nominal</oml:data_type>
+              <oml:nominal_value> - 50000.</oml:nominal_value>
+              <oml:nominal_value> 50000+.</oml:nominal_value>
+            <oml:is_target>false</oml:is_target>
+        <oml:is_ignore>false</oml:is_ignore>
+        <oml:is_row_identifier>false</oml:is_row_identifier>
+        <oml:number_of_missing_values>0</oml:number_of_missing_values>
+    </oml:feature>
+</oml:data_features>
diff --git a/tests/test_datasets/test_dataset_functions.py b/tests/test_datasets/test_dataset_functions.py
@@ -1954,3 +1954,9 @@ def test_get_dataset_with_invalid_id() -> None:
     with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
         openml.datasets.get_dataset(INVALID_ID)
         assert e.value.code == 111
+
+def test_read_features_from_xml_with_whitespace() -> None:
+    from openml.datasets.dataset import _read_features
+    features_file = Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
+    features = _read_features(features_file)  # parsed features, keyed by feature index
+    assert features[1].nominal_values == [" - 50000.", " 50000+."]  # leading spaces must be preserved