From ca2973cb077bc7b630aba3c88d12ffa1ecc4c85e Mon Sep 17 00:00:00 2001 From: Tom van der Weide Date: Mon, 18 Nov 2024 03:14:56 -0800 Subject: [PATCH] Restructure logic to minimize the number of file system accesses This also introduces a method that uses a glob to find all version folders instead of listing everything in a dir and then doing is_dir on all of them. PiperOrigin-RevId: 697562330 --- .../conll/conll_dataset_builder_test.py | 104 +++---- .../conll/conllu_dataset_builder_test.py | 270 +++++++++--------- .../core/folder_dataset/image_folder.py | 4 +- .../core/folder_dataset/translate_folder.py | 2 +- tensorflow_datasets/core/read_only_builder.py | 5 +- tensorflow_datasets/core/utils/file_utils.py | 88 ++++-- .../core/utils/file_utils_test.py | 29 ++ 7 files changed, 271 insertions(+), 231 deletions(-) diff --git a/tensorflow_datasets/core/dataset_builders/conll/conll_dataset_builder_test.py b/tensorflow_datasets/core/dataset_builders/conll/conll_dataset_builder_test.py index 00be1640ec0..9bcf90661ea 100644 --- a/tensorflow_datasets/core/dataset_builders/conll/conll_dataset_builder_test.py +++ b/tensorflow_datasets/core/dataset_builders/conll/conll_dataset_builder_test.py @@ -13,9 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for conll_dataset_builder.""" import textwrap -from unittest import mock from etils import epath import pytest @@ -25,28 +23,22 @@ _FOLDER_PATH = "mock/path" -_VALID_INPUT = textwrap.dedent( - """ +_VALID_INPUT = textwrap.dedent(""" -DOCSTART- -X- -X- O Winter NN B-NP O is VBZ B-VP O Air NN I-NP O . . O O -""" -) +""") -_INVALID_INPUT = textwrap.dedent( - """ +_INVALID_INPUT = textwrap.dedent(""" Winter NN B-NP is VBZ B-VP O Air NN I-NP O . . O O -""" -) - -_INPUT_PATH = epath.Path(_FOLDER_PATH, "input_path.txt") +""") class DummyConllDataset(conll_dataset_builder.ConllDatasetBuilder): @@ -63,53 +55,51 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" del dl_manager - return {"train": self._generate_examples(_INPUT_PATH)} - - -def test_generate_example(): - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT - expected_examples = [] - - dataset = DummyConllDataset() - - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_VALID_INPUT) - examples = list(dataset._generate_examples(_INPUT_PATH)) - - expected_examples = [ - ( - 0, - { - "tokens": ["Winter", "is"], - "pos": ["NN", "VBZ"], - "chunks": ["B-NP", "B-VP"], - "ner": ["O", "O"], - }, - ), - ( - 1, - { - "tokens": ["Air", "."], - "pos": ["NN", "."], - "chunks": ["I-NP", "O"], - "ner": ["O", "O"], - }, - ), - ] - - assert examples == expected_examples - - for _, example in examples: - assert len(example) == len(conll_lib.CONLL_2003_ORDERED_FEATURES) + return {"train": self._generate_examples("/tmp/input.txt")} + + +def test_generate_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input_path.txt" + input_path.write_text(_VALID_INPUT) + + dataset = DummyConllDataset(data_dir=tmpdir) + examples = list(dataset._generate_examples(input_path)) + + expected_examples = [ + ( + 0, + { + "tokens": ["Winter", "is"], + "pos": ["NN", "VBZ"], + "chunks": ["B-NP", "B-VP"], + "ner": ["O", "O"], + }, + ), + ( + 1, + { + "tokens": ["Air", "."], + "pos": ["NN", "."], + "chunks": ["I-NP", "O"], + "ner": ["O", "O"], + }, + ), + ] + + assert examples == expected_examples + + for _, example in 
examples: + assert len(example) == len(conll_lib.CONLL_2003_ORDERED_FEATURES) assert len(examples) == 2 -def test_generate_corrupted_example(): - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT - dataset = DummyConllDataset() +def test_generate_corrupted_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input_path.txt" + input_path.write_text(_INVALID_INPUT) + dataset = DummyConllDataset(data_dir=tmpdir) error_line = "Winter NN B-NP" error_msg = ( @@ -117,6 +107,4 @@ def test_generate_corrupted_example(): "Should be 4, but found 3" ) with pytest.raises(ValueError, match=error_msg): - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_INVALID_INPUT) - list(dataset._generate_examples(_INPUT_PATH)) + list(dataset._generate_examples(input_path)) diff --git a/tensorflow_datasets/core/dataset_builders/conll/conllu_dataset_builder_test.py b/tensorflow_datasets/core/dataset_builders/conll/conllu_dataset_builder_test.py index 87d04e86727..81840563941 100644 --- a/tensorflow_datasets/core/dataset_builders/conll/conllu_dataset_builder_test.py +++ b/tensorflow_datasets/core/dataset_builders/conll/conllu_dataset_builder_test.py @@ -13,9 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Tests for conllu_dataset_builder.""" import textwrap -from unittest import mock from etils import epath import pytest @@ -26,8 +24,7 @@ _FOLDER_PATH = "mock/path" -_VALID_INPUT = textwrap.dedent( - """ +_VALID_INPUT = textwrap.dedent(""" # sent_id = VIT-9558 # text = Il futuro. 1 Il il DET RD Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ _ @@ -42,21 +39,16 @@ 3 di di ADP E _ 6 case _ _ 4 l' il DET RD Definite=Def|Number=Sing|PronType=Art 6 det _ _ 5 ambiente ambiente NOUN S Gender=Masc|Number=Sing 3 nmod _ _ -""" -) +""") # The error making this invalid is the missing lemma field in line 1. -_INVALID_INPUT = textwrap.dedent( - """ +_INVALID_INPUT = textwrap.dedent(""" # sent_id = VIT-9558 # text = Il futuro. 1 Il DET RD Definite=Def|Gender=Masc|Number=Sing|PronType=Art 2 det _ _ 2 futuro futuro NOUN S Gender=Masc|Number=Sing 0 root _ SpaceAfter=No 3 . . 
PUNCT FS _ 2 punct _ _ -""" -) - -_INPUT_PATH = epath.Path(_FOLDER_PATH, "input_path.txt") +""") class DummyConllUDataset(conllu_dataset_builder.ConllUDatasetBuilder): @@ -78,100 +70,97 @@ def _info(self) -> tfds.core.DatasetInfo: def _split_generators(self, dl_manager: tfds.download.DownloadManager): """Returns SplitGenerators.""" del dl_manager - return {"train": self._generate_examples(_INPUT_PATH)} - - -def test_generate_example(): - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT - expected_examples = [] + return {"train": self._generate_examples("/tmp/input.txt")} + + +def test_generate_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input.txt" + input_path.write_text(_VALID_INPUT) + + dataset = DummyConllUDataset(data_dir=tmpdir) + + examples = list(dataset._generate_examples(input_path)) + expected_examples = [ + ( + 0, + { + "idx": "VIT-9558", + "text": "Il futuro.", + "tokens": ["Il", "futuro", "."], + "lemmas": ["il", "futuro", "."], + "upos": ["DET", "NOUN", "PUNCT"], + "xpos": ["RD", "S", "FS"], + "feats": [ + "{'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', " # pylint:disable=implicit-str-concat + "'PronType': 'Art'}", + "{'Gender': 'Masc', 'Number': 'Sing'}", + "None", + ], + "head": ["2", "0", "2"], + "deprel": ["det", "root", "punct"], + "deps": ["None", "None", "None"], + "misc": ["None", "{'SpaceAfter': 'No'}", "None"], + }, + ), + ( + 1, + { + "idx": "VIT-9478", + "text": "il responsabile dell'ambiente", + "tokens": [ + "il", + "responsabile", + "dell'", + "di", + "l'", + "ambiente", + ], + "lemmas": ["il", "responsabile", "_", "di", "il", "ambiente"], + "upos": ["DET", "NOUN", "_", "ADP", "DET", "NOUN"], + "xpos": ["RD", "S", "None", "E", "RD", "S"], + "feats": [ + "{'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', " # pylint:disable=implicit-str-concat + "'PronType': 'Art'}", + "{'Number': 'Sing'}", + "None", + "None", + "{'Definite': 'Def', 'Number': 'Sing', 'PronType': 'Art'}", + "{'Gender': 'Masc', 'Number': 'Sing'}", + ], + "head": ["3", "1", "None", "6", "6", "3"], + "deprel": ["det", "nsubj", "_", "case", "det", "nmod"], + "deps": ["None", "None", "None", "None", "None", "None"], + "misc": [ + "None", + "None", + "{'SpaceAfter': 'No'}", + "None", + "None", + "None", + ], + }, + ), + ] - dataset = DummyConllUDataset() + assert examples == expected_examples - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_VALID_INPUT) - examples = list(dataset._generate_examples(_INPUT_PATH)) - expected_examples = [ - ( - 0, - { - "idx": "VIT-9558", - "text": "Il futuro.", - "tokens": ["Il", "futuro", "."], - "lemmas": ["il", "futuro", "."], - "upos": ["DET", "NOUN", "PUNCT"], - "xpos": ["RD", "S", "FS"], - "feats": [ - "{'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', " # pylint:disable=implicit-str-concat - "'PronType': 'Art'}", - "{'Gender': 'Masc', 'Number': 'Sing'}", - "None", - ], - "head": ["2", "0", "2"], - "deprel": ["det", "root", "punct"], - "deps": ["None", "None", "None"], - "misc": ["None", "{'SpaceAfter': 'No'}", "None"], - }, - ), - ( - 1, - { - "idx": "VIT-9478", - "text": "il responsabile dell'ambiente", - "tokens": [ - "il", - "responsabile", - "dell'", - "di", - "l'", - "ambiente", - ], - "lemmas": ["il", "responsabile", "_", "di", "il", "ambiente"], - "upos": ["DET", "NOUN", "_", "ADP", "DET", "NOUN"], - "xpos": ["RD", "S", "None", "E", "RD", "S"], - "feats": [ - "{'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', " # pylint:disable=implicit-str-concat - 
"'PronType': 'Art'}", - "{'Number': 'Sing'}", - "None", - "None", - "{'Definite': 'Def', 'Number': 'Sing', 'PronType': 'Art'}", - "{'Gender': 'Masc', 'Number': 'Sing'}", - ], - "head": ["3", "1", "None", "6", "6", "3"], - "deprel": ["det", "nsubj", "_", "case", "det", "nmod"], - "deps": ["None", "None", "None", "None", "None", "None"], - "misc": [ - "None", - "None", - "{'SpaceAfter': 'No'}", - "None", - "None", - "None", - ], - }, - ), - ] - - assert examples == expected_examples - - for _, example in examples: - assert len(example) == len(conllu_lib.UNIVERSAL_DEPENDENCIES_FEATURES) + for _, example in examples: + assert len(example) == len(conllu_lib.UNIVERSAL_DEPENDENCIES_FEATURES) assert len(examples) == 2 -def test_generate_corrupted_example(): - conllu = lazy_imports_lib.lazy_imports.conllu +def test_generate_corrupted_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input.txt" + input_path.write_text(_INVALID_INPUT) - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT + conllu = lazy_imports_lib.lazy_imports.conllu dataset = DummyConllUDataset() with pytest.raises(conllu.exceptions.ParseException): - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_INVALID_INPUT) - list(dataset._generate_examples(_INPUT_PATH)) + list(dataset._generate_examples(input_path)) class DummyXtremePosConllUDataset(conllu_dataset_builder.ConllUDatasetBuilder): @@ -194,62 +183,59 @@ def _split_generators(self, dl_manager: tfds.download.DownloadManager): del dl_manager return { "train": self._generate_examples( - filepaths=_INPUT_PATH, + filepaths="/tmp/input.txt", process_example_fn=conllu_dataset_builder.get_xtreme_pos_example, ) } -def test_generate_xtreme_pos_example(): - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT - expected_examples = [] - - dataset = DummyXtremePosConllUDataset() - - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_VALID_INPUT) - examples = list(dataset._generate_examples(_INPUT_PATH)) - expected_examples = [ - ( - 0, - { - "tokens": ["Il", "futuro", "."], - "upos": ["DET", "NOUN", "PUNCT"], - }, - ), - ( - 1, - { - "tokens": [ - "il", - "responsabile", - "dell'", - "di", - "l'", - "ambiente", - ], - "upos": ["DET", "NOUN", "_", "ADP", "DET", "NOUN"], - }, - ), - ] - - assert examples == expected_examples - - for _, example in examples: - assert len(example) == len(conllu_lib.XTREME_POS_FEATURES) +def test_generate_xtreme_pos_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input.txt" + input_path.write_text(_VALID_INPUT) + + dataset = DummyXtremePosConllUDataset(data_dir=tmpdir) + + examples = list(dataset._generate_examples(input_path)) + expected_examples = [ + ( + 0, + { + "tokens": ["Il", "futuro", "."], + "upos": ["DET", "NOUN", "PUNCT"], + }, + ), + ( + 1, + { + "tokens": [ + "il", + "responsabile", + "dell'", + "di", + "l'", + "ambiente", + ], + "upos": ["DET", "NOUN", "_", "ADP", "DET", "NOUN"], + }, + ), + ] + + assert examples == expected_examples + + for _, example in examples: + assert len(example) == len(conllu_lib.XTREME_POS_FEATURES) assert len(examples) == 2 -def test_generate_corrupted_xtreme_pos_example(): +def test_generate_corrupted_xtreme_pos_example(tmpdir): + tmpdir = epath.Path(tmpdir) + input_path = tmpdir / "input.txt" + input_path.write_text(_INVALID_INPUT) conllu = lazy_imports_lib.lazy_imports.conllu - tf_mock = mock.Mock() - tf_mock.gfile.GFile.return_value = _VALID_INPUT - dataset = 
DummyXtremePosConllUDataset() + dataset = DummyXtremePosConllUDataset(data_dir=tmpdir) with pytest.raises(conllu.exceptions.ParseException): - with tfds.testing.MockFs() as fs: - fs.add_file(path=_INPUT_PATH, content=_INVALID_INPUT) - list(dataset._generate_examples(_INPUT_PATH)) + list(dataset._generate_examples(input_path)) diff --git a/tensorflow_datasets/core/folder_dataset/image_folder.py b/tensorflow_datasets/core/folder_dataset/image_folder.py index 1bd091f910b..b458b0247e0 100644 --- a/tensorflow_datasets/core/folder_dataset/image_folder.py +++ b/tensorflow_datasets/core/folder_dataset/image_folder.py @@ -89,11 +89,11 @@ def __init__( """ self._image_shape = shape self._image_dtype = dtype - super(ImageFolder, self).__init__() + root_dir = os.path.expanduser(root_dir) + super(ImageFolder, self).__init__(data_dir=root_dir) self._data_dir = root_dir # Set data_dir to the existing dir. # Extract the splits, examples, labels - root_dir = os.path.expanduser(root_dir) self._split_examples, labels = _get_split_label_images(root_dir) # Update DatasetInfo labels diff --git a/tensorflow_datasets/core/folder_dataset/translate_folder.py b/tensorflow_datasets/core/folder_dataset/translate_folder.py index 2e7917d9bdd..cff8d958b6a 100644 --- a/tensorflow_datasets/core/folder_dataset/translate_folder.py +++ b/tensorflow_datasets/core/folder_dataset/translate_folder.py @@ -70,7 +70,7 @@ def __init__(self, root_dir: str): root_dir ) - super(TranslateFolder, self).__init__() + super(TranslateFolder, self).__init__(data_dir=root_dir) # Reset `_data_dir` as it should not change to DATA_DIR/Version self._data_dir = root_dir diff --git a/tensorflow_datasets/core/read_only_builder.py b/tensorflow_datasets/core/read_only_builder.py index 2fea8072fc2..aca1b9479c9 100644 --- a/tensorflow_datasets/core/read_only_builder.py +++ b/tensorflow_datasets/core/read_only_builder.py @@ -274,7 +274,7 @@ def builder_from_files( name: str, **builder_kwargs: Any, ) -> dataset_builder.DatasetBuilder: - """Loads a `tfds.core.DatasetBuilder` from files, auto-infering location. + """Loads a `tfds.core.DatasetBuilder` from files, auto-inferring location. 
This function is similar to `tfds.builder` (same signature), but creates the `tfds.core.DatasetBuilder` directly from files, without loading @@ -336,6 +336,7 @@ def _find_builder_dir(name: str, **builder_kwargs: Any) -> epath.Path | None: name, **builder_kwargs ) version = builder_kwargs.pop('version', None) + version = str(version) if version else None config = builder_kwargs.pop('config', None) data_dir = builder_kwargs.pop('data_dir', None) @@ -492,7 +493,7 @@ def _find_builder_dir_single_dir( config_name=config_name, requested_version=version, ) - if found_version and str(found_version) != version: + if found_version and str(found_version) != str(version): dataset_dir = file_utils.get_dataset_dir( data_dir=data_dir, builder_name=builder_name, diff --git a/tensorflow_datasets/core/utils/file_utils.py b/tensorflow_datasets/core/utils/file_utils.py index bbe3f0bfaec..4ebda84bdfb 100644 --- a/tensorflow_datasets/core/utils/file_utils.py +++ b/tensorflow_datasets/core/utils/file_utils.py @@ -156,7 +156,7 @@ def list_data_dirs( return sorted(d.expanduser() for d in all_data_dirs) -def get_default_data_dir(given_data_dir: str | None = None) -> Path: +def get_default_data_dir(given_data_dir: epath.PathLike | None = None) -> Path: """Returns the default data_dir.""" if given_data_dir: data_dir = os.path.expanduser(given_data_dir) @@ -202,29 +202,36 @@ def get_data_dir_and_dataset_dir( dataset_dir: Dataset data directory (e.g. `///`) """ - all_data_dirs = list_data_dirs(given_data_dir=given_data_dir) - all_versions: set[version_lib.Version] = set() - dataset_dir_by_data_dir: dict[Path, Path] = {} + if version is not None: + version = version_lib.Version(version) + + # If the data dir is explicitly given, no need to search everywhere. + if given_data_dir is not None: + given_data_dir = epath.Path(given_data_dir) + given_dataset_dir = get_dataset_dir( + data_dir=given_data_dir, + builder_name=builder_name, + config_name=config_name, + version=version, + ) + return given_data_dir, given_dataset_dir - for data_dir in all_data_dirs: + # Check whether the dataset is in other data dirs. + dataset_dir_by_data_dir: dict[Path, Path] = {} + all_found_versions: set[version_lib.Version] = set() + for data_dir in list_data_dirs(given_data_dir=None): data_dir = Path(data_dir) - # List all existing versions - dataset_config_dir = get_dataset_dir( + dataset_dir = get_dataset_dir( data_dir=data_dir, builder_name=builder_name, config_name=config_name, version=None, ) - versions = version_lib.list_all_versions(dataset_config_dir) - # Check for existence of the requested version - if version in versions: - dataset_dir_by_data_dir[data_dir] = get_dataset_dir( - data_dir=data_dir, - builder_name=builder_name, - config_name=config_name, - version=version, - ) - all_versions.update(versions) + # Get all versions of the dataset in this dataset dir. 
+ found_versions = version_lib.list_all_versions(dataset_dir) + if version in found_versions: + dataset_dir_by_data_dir[data_dir] = dataset_dir / str(version) + all_found_versions.update(found_versions) if len(dataset_dir_by_data_dir) > 1: raise ValueError( @@ -237,25 +244,25 @@ def get_data_dir_and_dataset_dir( return next(iter(dataset_dir_by_data_dir.items())) # No dataset found, use default directory - default_data_dir = get_default_data_dir(given_data_dir=given_data_dir) + default_data_dir = get_default_data_dir() dataset_dir = get_dataset_dir( data_dir=default_data_dir, builder_name=builder_name, config_name=config_name, version=version, ) - if all_versions: + if all_found_versions: logging.warning( ( 'Found a different version of the requested dataset' - ' (given_data_dir=%s,dataset=%s, config=%s, version=%s):\n' - '%s\nUsing %s instead.' + ' (given_data_dir=%s, dataset=%s, config=%s, version=%s):\n' + '%s\nUsing default data dir %s instead.' ), given_data_dir, builder_name, config_name, version, - '\n'.join(str(v) for v in sorted(all_versions)), + '\n'.join(str(v) for v in sorted(all_found_versions)), dataset_dir, ) return default_data_dir, dataset_dir @@ -285,12 +292,15 @@ def _find_files_without_glob( Yields: the matching file paths. """ + if not folder.is_dir(): + return for glob in globs: glob_parts = glob.split('/') if len(glob_parts) == 1: try: - for file in folder.iterdir(): - if file.name in file_names: + for file_name in file_names: + file = folder / file_name + if file.exists(): yield file except OSError: logging.exception('Could not find files in %s', folder) @@ -313,10 +323,10 @@ def _find_files_with_glob( globs: list[str], file_names: list[str], ) -> Iterator[epath.Path]: - """Finds files matching any of the given globs and given file names.""" + """Returns files matching any of the given globs and given file names.""" for glob in globs: - found_files = folder.glob(glob) try: + found_files = folder.glob(glob) for file in found_files: if file.name in file_names: yield file @@ -438,6 +448,32 @@ def _find_references_with_glob( ) +def list_dataset_versions( + dataset_config_dir: epath.PathLike, +) -> list[version_lib.Version]: + """Returns all dataset versions (sorted ascendingly) found in `dataset_config_dir`. + + Checks whether the version is a valid TFDS version and whether the folder + contains a dataset_info.json file. + + Arguments: + dataset_config_dir: the folder that contains version subfolders. 
+ """ + dataset_config_dir = epath.Path(dataset_config_dir) + glob = f'*/{constants.DATASET_INFO_FILENAME}' + found_versions: list[version_lib.Version] = [] + for dataset_info_file in _find_files_with_glob( + dataset_config_dir, + globs=[glob], + file_names=[constants.DATASET_INFO_FILENAME], + ): + print(f'XXXXXXXX: {dataset_info_file=}') + version_folder = dataset_info_file.parent.name + if version_lib.Version.is_valid(version_folder): + found_versions.append(version_lib.Version(version_folder)) + return sorted(found_versions) + + def list_dataset_variants( dataset_dir: epath.PathLike, namespace: str | None = None, diff --git a/tensorflow_datasets/core/utils/file_utils_test.py b/tensorflow_datasets/core/utils/file_utils_test.py index a34b968123d..2ac3d964525 100644 --- a/tensorflow_datasets/core/utils/file_utils_test.py +++ b/tensorflow_datasets/core/utils/file_utils_test.py @@ -24,6 +24,7 @@ from tensorflow_datasets.core import constants from tensorflow_datasets.core import naming from tensorflow_datasets.core.utils import file_utils +from tensorflow_datasets.core.utils import version as version_lib _DATA_DIR = epath.Path('/a') _DATASET_NAME = 'my_ds' @@ -162,6 +163,34 @@ def _add_features( mock_fs.add_file(dataset_dir / constants.FEATURES_FILENAME, content=content) +def _touch(path: epath.Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.touch() + + +def test_list_dataset_versions(tmpdir): + tmpdir = epath.Path(tmpdir) + _touch(tmpdir / '1.0.0' / constants.DATASET_INFO_FILENAME) + _touch(tmpdir / '1.0.1' / constants.DATASET_INFO_FILENAME) + _touch(tmpdir / '3.0.0' / constants.DATASET_INFO_FILENAME) + + # Does not have dataset_info.json, so ignored. + _touch(tmpdir / '4.0.0' / 'other_file.json') + + # Version folder is inside a subfolder, so ignored. + _touch(tmpdir / 'xxx' / '1.0.0' / constants.DATASET_INFO_FILENAME) + + # Subfolder name is not a valid version, so ignored. + _touch(tmpdir / 'not_valid_version' / constants.DATASET_INFO_FILENAME) + + actual_versions = file_utils.list_dataset_versions(tmpdir) + assert actual_versions == [ + version_lib.Version('1.0.0'), + version_lib.Version('1.0.1'), + version_lib.Version('3.0.0'), + ] + + def test_list_dataset_variants_with_configs(mock_fs: testing.MockFs): configs_and_versions = { 'x': [_VERSION, '1.0.1'],