Skip to content

Commit

Permalink
fix: avoid stripping whitespaces for feature names (#1368)
Browse files Browse the repository at this point in the history
* fix: minimally invasive change to avoid stripping whitespace from feature names

Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>

* fix: roll back change to work with older and newer xmltodict versions

* add: test for whitespaces in features xml

---------

Co-authored-by: amastruserio <amastruserio@users.noreply.github.com>
  • Loading branch information
LennartPurucker and amastruserio authored Oct 16, 2024
1 parent aa0aca0 commit d0deb6d
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 8 deletions.
4 changes: 3 additions & 1 deletion openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,7 +1077,9 @@ def _read_features(features_file: Path) -> dict[int, OpenMLDataFeature]:


def _parse_features_xml(features_xml_string: str) -> dict[int, OpenMLDataFeature]:
xml_dict = xmltodict.parse(features_xml_string, force_list=("oml:feature", "oml:nominal_value"))
xml_dict = xmltodict.parse(
features_xml_string, force_list=("oml:feature", "oml:nominal_value"), strip_whitespace=False
)
features_xml = xml_dict["oml:data_features"]

features: dict[int, OpenMLDataFeature] = {}
Expand Down
8 changes: 1 addition & 7 deletions openml/study/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def get_study(
return study


def _get_study(id_: int | str, entity_type: str) -> BaseStudy: # noqa: C901
def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
force_list_tags = (
"oml:data_id",
Expand All @@ -93,12 +93,6 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy: # noqa: C901
alias = result_dict.get("oml:alias", None)
main_entity_type = result_dict["oml:main_entity_type"]

# Parses edge cases where the server returns a string with a newline character for empty values.
none_value_indicator = "\n "
for key in result_dict:
if result_dict[key] == none_value_indicator:
result_dict[key] = None

if entity_type != main_entity_type:
raise ValueError(
f"Unexpected entity type '{main_entity_type}' reported by the server"
Expand Down
22 changes: 22 additions & 0 deletions tests/files/misc/features_with_whitespaces.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<oml:data_features xmlns:oml="http://openml.org/openml">
<oml:feature>
<oml:index>0</oml:index>
<oml:name>V1</oml:name>
<oml:data_type>numeric</oml:data_type>
<oml:is_target>false</oml:is_target>
<oml:is_ignore>false</oml:is_ignore>
<oml:is_row_identifier>false</oml:is_row_identifier>
<oml:number_of_missing_values>0</oml:number_of_missing_values>
</oml:feature>
<oml:feature>
<oml:index>1</oml:index>
<oml:name>V42</oml:name>
<oml:data_type>nominal</oml:data_type>
<oml:nominal_value> - 50000.</oml:nominal_value>
<oml:nominal_value> 50000+.</oml:nominal_value>
<oml:is_target>false</oml:is_target>
<oml:is_ignore>false</oml:is_ignore>
<oml:is_row_identifier>false</oml:is_row_identifier>
<oml:number_of_missing_values>0</oml:number_of_missing_values>
</oml:feature>
</oml:data_features>
6 changes: 6 additions & 0 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1954,3 +1954,9 @@ def test_get_dataset_with_invalid_id() -> None:
with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as e:
openml.datasets.get_dataset(INVALID_ID)
assert e.value.code == 111

def test_read_features_from_xml_with_whitespace() -> None:
    """Check that whitespace in nominal values survives feature parsing.

    Reads the ``features_with_whitespaces.xml`` fixture through
    ``_read_features`` and asserts that the nominal values of the entry
    stored under key ``1`` are returned exactly as written in the XML,
    including their leading spaces.
    """
    from openml.datasets.dataset import _read_features

    features_file = (
        Path(__file__).parent.parent / "files" / "misc" / "features_with_whitespaces.xml"
    )
    # Bound to ``features`` (not ``dict``) so the builtin type is not shadowed.
    features = _read_features(features_file)
    assert features[1].nominal_values == [" - 50000.", " 50000+."]

0 comments on commit d0deb6d

Please sign in to comment.