From 3ec7a08ac4739c3898149746fd444ea6a17212a7 Mon Sep 17 00:00:00 2001 From: Tim Sweeney Date: Wed, 22 Jan 2025 11:42:43 -0800 Subject: [PATCH] Functionality Checkpoint --- tests/trace/test_dataset.py | 17 +++++++++++++++++ tests/trace/test_evaluate.py | 5 +++++ weave/trace/vals.py | 28 ++++++++++++++++++---------- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/tests/trace/test_dataset.py b/tests/trace/test_dataset.py index d55af19c0f37..2a8dc03cce6b 100644 --- a/tests/trace/test_dataset.py +++ b/tests/trace/test_dataset.py @@ -37,6 +37,10 @@ def test_pythonic_access(client): def test_dataset_laziness(client): + """ + The intention of this test is to show that local construction of + a dataset does not trigger any remote operations. + """ dataset = Dataset(rows=[{"input": i} for i in range(300)]) log = client.server.attribute_access_log assert [l for l in log if not l.startswith("_")] == ["ensure_project_exists"] @@ -60,6 +64,12 @@ def test_dataset_laziness(client): def test_published_dataset_laziness(client): + """ + The intention of this test is to show that publishing a dataset, + then iterating through the "gotten" version of the dataset has + minimal remote operations - and importantly delays the fetching + of the rows until they are actually needed. + """ dataset = Dataset(rows=[{"input": i} for i in range(300)]) log = client.server.attribute_access_log assert [l for l in log if not l.startswith("_")] == ["ensure_project_exists"] @@ -89,6 +99,13 @@ def test_published_dataset_laziness(client): i = 0 for row in dataset: log = client.server.attribute_access_log + # This is the critical part of the test - ensuring that + # the rows are only fetched when they are actually needed. + # + # In a future improvement, we might eagerly fetch the next + # page of results, which would result in this assertion changing + # in that there would always be one more "table_query" than + # the number of pages. assert [l for l in log if not l.startswith("_")] == ["table_query"] * ( (i // 100) + 1 ) diff --git a/tests/trace/test_evaluate.py b/tests/trace/test_evaluate.py index 80578fa96b25..2a2ca4d407d2 100644 --- a/tests/trace/test_evaluate.py +++ b/tests/trace/test_evaluate.py @@ -289,6 +289,11 @@ def score(self, target, model_output): def test_evaluate_table_lazy_iter(client): + """ + The intention of this test is to show that an evaluation harness + lazily fetches rows from a table rather than eagerly fetching all + rows up front. + """ dataset = Dataset(rows=[{"input": i} for i in range(300)]) ref = weave.publish(dataset) dataset = ref.get() diff --git a/weave/trace/vals.py b/weave/trace/vals.py index 6e981282332c..f90df4eec95f 100644 --- a/weave/trace/vals.py +++ b/weave/trace/vals.py @@ -313,10 +313,14 @@ def rows(self, value: ROWS_TYPES) -> None: self._rows = value self._mark_dirty() - def _ensure_rows_are_local_list(self) -> list[dict]: - # Any uses of this are signs of a design problem arising - # from a remote table clashing with the need to feel like a - # local list. + def _inefficiently_materialize_rows_as_list(self) -> list[dict]: + # This method is named `inefficiently` to warn callers that + # it should be avoided. We have this nasty paradigm where sometimes + # a WeaveTable needs to act like a list, but it is actually a remote + # table. This method will force iteration through the remote data + # and materialize it into a list. Any uses of this are signs of a design + # problem arising from a remote table clashing with the need to feel like + # a local list. if not isinstance(self.rows, list): self.rows = list(self.rows) return self.rows @@ -356,7 +360,10 @@ def __len__(self) -> int: self._known_length = self._fetch_remote_length() return self._known_length - rows_as_list = self._ensure_rows_are_local_list() + # Finally, if we have no table ref, we can still get the length + # by materializing the rows as a list. I actually think this + # can never happen, but it is here for completeness. + rows_as_list = self._inefficiently_materialize_rows_as_list() return len(rows_as_list) def _fetch_remote_length(self) -> int: @@ -477,9 +484,10 @@ def _remote_iter(self) -> Generator[dict, None, None]: page_index += 1 def __getitem__(self, key: Union[int, slice, str]) -> Any: - # TODO: we should have a better caching strategy that allows - # partial iteration over the table. - rows = self._ensure_rows_are_local_list() + # TODO: ideally we would have some sort of intelligent + # LRU style caching that allows us to minimize materialization + # of the rows as a list. + rows = self._inefficiently_materialize_rows_as_list() if isinstance(key, (int, slice)): return rows[key] @@ -494,14 +502,14 @@ def __iter__(self) -> Iterator[dict]: return iter(self.rows) def append(self, val: dict) -> None: - rows = self._ensure_rows_are_local_list() + rows = self._inefficiently_materialize_rows_as_list() if not isinstance(val, dict): raise TypeError("Can only append dicts to tables") self._mark_dirty() rows.append(val) def pop(self, index: int) -> None: - rows = self._ensure_rows_are_local_list() + rows = self._inefficiently_materialize_rows_as_list() self._mark_dirty() rows.pop(index)