diff --git a/src/unitxt/loaders.py b/src/unitxt/loaders.py index 62c49d89f..f9bb2878d 100644 --- a/src/unitxt/loaders.py +++ b/src/unitxt/loaders.py @@ -57,7 +57,6 @@ IterableDataset, IterableDatasetDict, get_dataset_split_names, - load_dataset_builder, ) from datasets import load_dataset as hf_load_dataset from huggingface_hub import HfApi @@ -168,7 +167,7 @@ def load_data(self) -> MultiStream: self.__class__._loader_cache.max_size = settings.loader_cache_size self.__class__._loader_cache[str(self)] = iterables if isoftype(iterables, Dict[str, ReusableGenerator]): - return MultiStream.from_generators(iterables) + return MultiStream.from_generators(iterables, copying=True) return MultiStream.from_iterables(iterables, copying=True) def process(self) -> MultiStream: @@ -476,11 +475,15 @@ def load_iterables(self): } def split_generator(self, split: str) -> Generator: - split_data = self.downloader(subset=split) - targets = [split_data["target_names"][t] for t in split_data["target"]] - df = pd.DataFrame([split_data["data"], targets]).T - df.columns = ["data", "target"] - dataset = df.to_dict("records") + dataset = self.__class__._loader_cache.get(str(self) + "_" + split, None) + if dataset is None: + split_data = self.downloader(subset=split) + targets = [split_data["target_names"][t] for t in split_data["target"]] + df = pd.DataFrame([split_data["data"], targets]).T + df.columns = ["data", "target"] + dataset = df.to_dict("records") + self.__class__._loader_cache.max_size = settings.loader_cache_size + self.__class__._loader_cache[str(self) + "_" + split] = dataset yield from dataset diff --git a/utils/.secrets.baseline b/utils/.secrets.baseline index 06ff1a310..6c495f8ec 100644 --- a/utils/.secrets.baseline +++ b/utils/.secrets.baseline @@ -151,7 +151,7 @@ "filename": "src/unitxt/loaders.py", "hashed_secret": "840268f77a57d5553add023cfa8a4d1535f49742", "is_verified": false, - "line_number": 566, + "line_number": 572, "is_secret": false } ], @@ -184,5 +184,5 @@ } ] }, - "generated_at": "2025-01-22T20:27:31Z" + "generated_at": "2025-01-23T10:07:40Z" }