Skip to content

Commit

Permalink
fix(datasets): Add code 111 for dataset description not found error (
Browse files Browse the repository at this point in the history
…#1356)

* fix(datasets): Add code `111` for dataset description not found error

* test(dataset): Test the error raised

* test: Make the error assertion stricter
  • Loading branch information
eddiebergman authored Oct 14, 2024
1 parent 7acfb6a commit 891f4a6
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 104 deletions.
9 changes: 4 additions & 5 deletions openml/_api_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,18 +473,17 @@ def __parse_server_exception(
code = int(server_error["oml:code"])
message = server_error["oml:message"]
additional_information = server_error.get("oml:additional_information")
if code in [372, 512, 500, 482, 542, 674]:
if code in [111, 372, 512, 500, 482, 542, 674]:
if additional_information:
full_message = f"{message} - {additional_information}"
else:
full_message = message

# 512 for runs, 372 for datasets, 500 for flows
# 482 for tasks, 542 for evaluations, 674 for setups
return OpenMLServerNoResult(
code=code,
message=full_message,
)
# 111 for dataset descriptions
return OpenMLServerNoResult(code=code, message=full_message, url=url)

# 163: failure to validate flow XML (https://www.openml.org/api_docs#!/flow/post_flow)
if code in [163] and file_elements is not None and "description" in file_elements:
# file_elements['description'] is the XML file description of the flow
Expand Down
139 changes: 40 additions & 99 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
OpenMLNotAuthorizedError,
OpenMLPrivateDatasetError,
OpenMLServerException,
OpenMLServerNoResult,
)
from openml.tasks import TaskType, create_task
from openml.testing import TestBase, create_request_response
Expand Down Expand Up @@ -274,9 +275,7 @@ def test_get_dataset_cannot_access_private_data(self):
@pytest.mark.skip("Need to find dataset name of private dataset")
def test_dataset_by_name_cannot_access_private_data(self):
openml.config.server = self.production_server
self.assertRaises(
OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE"
)
self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, "NAME_GOES_HERE")

def test_get_dataset_lazy_all_functions(self):
"""Test that all expected functionality is available without downloading the dataset."""
Expand All @@ -285,9 +284,7 @@ def test_get_dataset_lazy_all_functions(self):

def ensure_absence_of_real_data():
assert not os.path.exists(
os.path.join(
openml.config.get_cache_directory(), "datasets", "1", "dataset.arff"
)
os.path.join(openml.config.get_cache_directory(), "datasets", "1", "dataset.arff")
)

tag = "test_lazy_tag_%d" % random.randint(1, 1000000)
Expand Down Expand Up @@ -509,12 +506,8 @@ def test_deletion_of_cache_dir(self):
@mock.patch("openml.datasets.functions._get_dataset_description")
def test_deletion_of_cache_dir_faulty_download(self, patch):
patch.side_effect = Exception("Boom!")
self.assertRaisesRegex(
Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1
)
datasets_cache_dir = os.path.join(
self.workdir, "org", "openml", "test", "datasets"
)
self.assertRaisesRegex(Exception, "Boom!", openml.datasets.get_dataset, dataset_id=1)
datasets_cache_dir = os.path.join(self.workdir, "org", "openml", "test", "datasets")
assert len(os.listdir(datasets_cache_dir)) == 0

def test_publish_dataset(self):
Expand Down Expand Up @@ -555,9 +548,7 @@ def test__retrieve_class_labels(self):
# Test workaround for string-typed class labels
custom_ds = openml.datasets.get_dataset(2)
custom_ds.features[31].data_type = "string"
labels = custom_ds.retrieve_class_labels(
target_name=custom_ds.features[31].name
)
labels = custom_ds.retrieve_class_labels(target_name=custom_ds.features[31].name)
assert labels == ["COIL", "SHEET"]

def test_upload_dataset_with_url(self):
Expand Down Expand Up @@ -600,9 +591,7 @@ def test_data_status(self):
)
dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
did = dataset.id

# admin key for test server (only adminds can activate datasets.
Expand Down Expand Up @@ -678,8 +667,7 @@ def test_attributes_arff_from_df_unknown_dtype(self):
for arr, dt in zip(data, dtype):
df = pd.DataFrame(arr)
err_msg = (
f"The dtype '{dt}' of the column '0' is not currently "
"supported by liac-arff"
f"The dtype '{dt}' of the column '0' is not currently " "supported by liac-arff"
)
with pytest.raises(ValueError, match=err_msg):
attributes_arff_from_df(df)
Expand Down Expand Up @@ -710,16 +698,12 @@ def test_create_dataset_numpy(self):

dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))

assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded arff does not match original one"
assert (
_get_online_dataset_format(dataset.id) == "arff"
), "Wrong format for dataset"
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"

def test_create_dataset_list(self):
data = [
Expand Down Expand Up @@ -769,15 +753,11 @@ def test_create_dataset_list(self):

dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
assert (
_get_online_dataset_format(dataset.id) == "arff"
), "Wrong format for dataset"
assert _get_online_dataset_format(dataset.id) == "arff", "Wrong format for dataset"

def test_create_dataset_sparse(self):
# test the scipy.sparse.coo_matrix
Expand Down Expand Up @@ -974,9 +954,7 @@ def test_create_dataset_pandas(self):
)
dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
Expand All @@ -991,9 +969,7 @@ def test_create_dataset_pandas(self):
column_names = ["input1", "input2", "y"]
df = pd.DataFrame.sparse.from_spmatrix(sparse_data, columns=column_names)
# meta-information
description = (
"Synthetic dataset created from a Pandas DataFrame with Sparse columns"
)
description = "Synthetic dataset created from a Pandas DataFrame with Sparse columns"
dataset = openml.datasets.functions.create_dataset(
name=name,
description=description,
Expand All @@ -1014,15 +990,11 @@ def test_create_dataset_pandas(self):
)
dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
assert (
_get_online_dataset_arff(dataset.id) == dataset._dataset
), "Uploaded ARFF does not match original one"
assert (
_get_online_dataset_format(dataset.id) == "sparse_arff"
), "Wrong format for dataset"
assert _get_online_dataset_format(dataset.id) == "sparse_arff", "Wrong format for dataset"

# Check that we can overwrite the attributes
data = [["a"], ["b"], ["c"], ["d"], ["e"]]
Expand Down Expand Up @@ -1050,13 +1022,9 @@ def test_create_dataset_pandas(self):
)
dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
downloaded_data = _get_online_dataset_arff(dataset.id)
assert (
downloaded_data == dataset._dataset
), "Uploaded ARFF does not match original one"
assert downloaded_data == dataset._dataset, "Uploaded ARFF does not match original one"
assert "@ATTRIBUTE rnd_str {a, b, c, d, e, f, g}" in downloaded_data

def test_ignore_attributes_dataset(self):
Expand Down Expand Up @@ -1217,9 +1185,7 @@ def test_publish_fetch_ignore_attribute(self):
# publish dataset
dataset.publish()
TestBase._mark_entity_for_removal("data", dataset.id)
TestBase.logger.info(
"collected from {}: {}".format(__file__.split("/")[-1], dataset.id)
)
TestBase.logger.info("collected from {}: {}".format(__file__.split("/")[-1], dataset.id))
# test if publish was successful
assert isinstance(dataset.id, int)

Expand Down Expand Up @@ -1403,9 +1369,7 @@ def test_get_dataset_cache_format_feather(self):
cache_dir = openml.config.get_cache_directory()
cache_dir_for_id = os.path.join(cache_dir, "datasets", "128")
feather_file = os.path.join(cache_dir_for_id, "dataset.feather")
pickle_file = os.path.join(
cache_dir_for_id, "dataset.feather.attributes.pkl.py3"
)
pickle_file = os.path.join(cache_dir_for_id, "dataset.feather.attributes.pkl.py3")
data = pd.read_feather(feather_file)
assert os.path.isfile(feather_file), "Feather file is missing"
assert os.path.isfile(pickle_file), "Attributes pickle file is missing"
Expand Down Expand Up @@ -1450,19 +1414,15 @@ def test_data_edit_critical_field(self):
# for this, we need to first clone a dataset to do changes
did = fork_dataset(1)
self._wait_for_dataset_being_processed(did)
result = edit_dataset(
did, default_target_attribute="shape", ignore_attribute="oil"
)
result = edit_dataset(did, default_target_attribute="shape", ignore_attribute="oil")
assert did == result

n_tries = 10
# we need to wait for the edit to be reflected on the server
for i in range(n_tries):
edited_dataset = openml.datasets.get_dataset(did)
try:
assert (
edited_dataset.default_target_attribute == "shape"
), edited_dataset
assert edited_dataset.default_target_attribute == "shape", edited_dataset
assert edited_dataset.ignore_attribute == ["oil"], edited_dataset
break
except AssertionError as e:
Expand All @@ -1471,9 +1431,7 @@ def test_data_edit_critical_field(self):
time.sleep(10)
# Delete the cache dir to get the newer version of the dataset
shutil.rmtree(
os.path.join(
self.workdir, "org", "openml", "test", "datasets", str(did)
),
os.path.join(self.workdir, "org", "openml", "test", "datasets", str(did)),
)

def test_data_edit_requires_field(self):
Expand Down Expand Up @@ -1564,9 +1522,7 @@ def test_list_datasets_with_high_size_parameter(self):
openml.config.server = self.production_server

datasets_a = openml.datasets.list_datasets(output_format="dataframe")
datasets_b = openml.datasets.list_datasets(
output_format="dataframe", size=np.inf
)
datasets_b = openml.datasets.list_datasets(output_format="dataframe", size=np.inf)

# Reverting to test server
openml.config.server = self.test_server
Expand Down Expand Up @@ -1646,9 +1602,7 @@ def test_invalid_attribute_validations(
(None, None, ["outlook", "windy"]),
],
)
def test_valid_attribute_validations(
default_target_attribute, row_id_attribute, ignore_attribute
):
def test_valid_attribute_validations(default_target_attribute, row_id_attribute, ignore_attribute):
data = [
["a", "sunny", 85.0, 85.0, "FALSE", "no"],
["b", "sunny", 80.0, 90.0, "TRUE", "no"],
Expand Down Expand Up @@ -1749,10 +1703,7 @@ def test_delete_dataset(self):
def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_key):
openml.config.start_using_configuration_for_example()
content_file = (
test_files_directory
/ "mock_responses"
/ "datasets"
/ "data_delete_not_owned.xml"
test_files_directory / "mock_responses" / "datasets" / "data_delete_not_owned.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
Expand All @@ -1774,10 +1725,7 @@ def test_delete_dataset_not_owned(mock_delete, test_files_directory, test_api_ke
def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key):
openml.config.start_using_configuration_for_example()
content_file = (
test_files_directory
/ "mock_responses"
/ "datasets"
/ "data_delete_has_tasks.xml"
test_files_directory / "mock_responses" / "datasets" / "data_delete_has_tasks.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
Expand All @@ -1799,10 +1747,7 @@ def test_delete_dataset_with_run(mock_delete, test_files_directory, test_api_key
def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key):
openml.config.start_using_configuration_for_example()
content_file = (
test_files_directory
/ "mock_responses"
/ "datasets"
/ "data_delete_successful.xml"
test_files_directory / "mock_responses" / "datasets" / "data_delete_successful.xml"
)
mock_delete.return_value = create_request_response(
status_code=200,
Expand All @@ -1821,10 +1766,7 @@ def test_delete_dataset_success(mock_delete, test_files_directory, test_api_key)
def test_delete_unknown_dataset(mock_delete, test_files_directory, test_api_key):
openml.config.start_using_configuration_for_example()
content_file = (
test_files_directory
/ "mock_responses"
/ "datasets"
/ "data_delete_not_exist.xml"
test_files_directory / "mock_responses" / "datasets" / "data_delete_not_exist.xml"
)
mock_delete.return_value = create_request_response(
status_code=412,
Expand Down Expand Up @@ -1861,9 +1803,7 @@ def test_list_datasets(all_datasets: pd.DataFrame):


def test_list_datasets_by_tag(all_datasets: pd.DataFrame):
tag_datasets = openml.datasets.list_datasets(
tag="study_14", output_format="dataframe"
)
tag_datasets = openml.datasets.list_datasets(tag="study_14", output_format="dataframe")
assert 0 < len(tag_datasets) < len(all_datasets)
_assert_datasets_have_id_and_valid_status(tag_datasets)

Expand Down Expand Up @@ -2001,15 +1941,16 @@ def test_get_dataset_lazy_behavior(
with_features=with_features,
with_data=with_data,
)
assert (
dataset.features
), "Features should be downloaded on-demand if not during get_dataset"
assert (
dataset.qualities
), "Qualities should be downloaded on-demand if not during get_dataset"
assert (
dataset.get_data()
), "Data should be downloaded on-demand if not during get_dataset"
assert dataset.features, "Features should be downloaded on-demand if not during get_dataset"
assert dataset.qualities, "Qualities should be downloaded on-demand if not during get_dataset"
assert dataset.get_data(), "Data should be downloaded on-demand if not during get_dataset"
_assert_datasets_retrieved_successfully(
[1], with_qualities=True, with_features=True, with_data=True
)


def test_get_dataset_with_invalid_id() -> None:
    """A nonexistent dataset id raises ``OpenMLServerNoResult`` carrying server code 111."""
    # NOTE: this id is only "invalid" until the server one day catches up to it...
    nonexistent_id = 123819023109238
    with pytest.raises(OpenMLServerNoResult, match="Unknown dataset") as exc_info:
        openml.datasets.get_dataset(nonexistent_id)
    # Code 111 is the server's "dataset description not found" error.
    assert exc_info.value.code == 111

0 comments on commit 891f4a6

Please sign in to comment.