Skip to content

Commit

Permalink
Functionality Checkpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
tssweeney committed Jan 22, 2025
1 parent 64b12c5 commit 3ec7a08
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 10 deletions.
17 changes: 17 additions & 0 deletions tests/trace/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ def test_pythonic_access(client):


def test_dataset_laziness(client):
"""
The intention of this test is to show that local construction of
a dataset does not trigger any remote operations.
"""
dataset = Dataset(rows=[{"input": i} for i in range(300)])
log = client.server.attribute_access_log
assert [l for l in log if not l.startswith("_")] == ["ensure_project_exists"]
Expand All @@ -60,6 +64,12 @@ def test_dataset_laziness(client):


def test_published_dataset_laziness(client):
"""
The intention of this test is to show that publishing a dataset,
then iterating through the "gotten" version of the dataset has
minimal remote operations - and importantly delays the fetching
of the rows until they are actually needed.
"""
dataset = Dataset(rows=[{"input": i} for i in range(300)])
log = client.server.attribute_access_log
assert [l for l in log if not l.startswith("_")] == ["ensure_project_exists"]
Expand Down Expand Up @@ -89,6 +99,13 @@ def test_published_dataset_laziness(client):
i = 0
for row in dataset:
log = client.server.attribute_access_log
# This is the critical part of the test - ensuring that
# the rows are only fetched when they are actually needed.
#
# In a future improvement, we might eagerly fetch the next
# page of results, which would result in this assertion changing
# in that there would always be one more "table_query" than
# the number of pages.
assert [l for l in log if not l.startswith("_")] == ["table_query"] * (
(i // 100) + 1
)
Expand Down
5 changes: 5 additions & 0 deletions tests/trace/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,11 @@ def score(self, target, model_output):


def test_evaluate_table_lazy_iter(client):
"""
The intention of this test is to show that an evaluation harness
lazily fetches rows from a table rather than eagerly fetching all
rows up front.
"""
dataset = Dataset(rows=[{"input": i} for i in range(300)])
ref = weave.publish(dataset)
dataset = ref.get()
Expand Down
28 changes: 18 additions & 10 deletions weave/trace/vals.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,10 +313,14 @@ def rows(self, value: ROWS_TYPES) -> None:
self._rows = value
self._mark_dirty()

def _ensure_rows_are_local_list(self) -> list[dict]:
# Any uses of this are signs of a design problem arising
# from a remote table clashing with the need to feel like a
# local list.
def _inefficiently_materialize_rows_as_list(self) -> list[dict]:
    """Force the table's rows into a concrete local list and return it.

    The deliberately ugly name is a warning to callers: a WeaveTable is
    normally a lazy view over remote data, and calling this forces a full
    iteration that pulls every row down and materializes it locally. Any
    call site is a symptom of the underlying design tension between a
    remote table and list-like local semantics — avoid if possible.
    """
    current = self.rows
    if not isinstance(current, list):
        # Materialize via the `rows` setter so its bookkeeping still runs.
        self.rows = list(current)
    return self.rows
Expand Down Expand Up @@ -356,7 +360,10 @@ def __len__(self) -> int:
self._known_length = self._fetch_remote_length()
return self._known_length

rows_as_list = self._ensure_rows_are_local_list()
# Finally, if we have no table ref, we can still get the length
# by materializing the rows as a list. I actually think this
# can never happen, but it is here for completeness.
rows_as_list = self._inefficiently_materialize_rows_as_list()
return len(rows_as_list)

def _fetch_remote_length(self) -> int:
Expand Down Expand Up @@ -477,9 +484,10 @@ def _remote_iter(self) -> Generator[dict, None, None]:
page_index += 1

def __getitem__(self, key: Union[int, slice, str]) -> Any:
# TODO: we should have a better caching strategy that allows
# partial iteration over the table.
rows = self._ensure_rows_are_local_list()
# TODO: ideally we would have some sort of intelligent
# LRU style caching that allows us to minimize materialization
# of the rows as a list.
rows = self._inefficiently_materialize_rows_as_list()

if isinstance(key, (int, slice)):
return rows[key]
Expand All @@ -494,14 +502,14 @@ def __iter__(self) -> Iterator[dict]:
return iter(self.rows)

def append(self, val: dict) -> None:
    """Append a single row to the table.

    Forces full local materialization of the rows (expensive for remote
    tables), validates that the new row is a dict, and marks the table
    dirty before appending.

    Raises:
        TypeError: if `val` is not a dict.
    """
    materialized = self._inefficiently_materialize_rows_as_list()
    if not isinstance(val, dict):
        raise TypeError("Can only append dicts to tables")
    self._mark_dirty()
    materialized.append(val)

def pop(self, index: int) -> None:
    """Remove the row at `index` from the table.

    Forces full local materialization of the rows (expensive for remote
    tables) and marks the table dirty before removing the row. Unlike
    `list.pop`, the removed row is not returned.

    Raises:
        IndexError: if `index` is out of range (propagated from list.pop).
    """
    materialized = self._inefficiently_materialize_rows_as_list()
    self._mark_dirty()
    materialized.pop(index)

Expand Down

0 comments on commit 3ec7a08

Please sign in to comment.