Zero-fill start positions instead of one-filling (#826)

PRs in the history of this problem: #665, #723. #665 is supposed to fix a NaN cache corruption issue by 1-filling seq_len instead of 0-filling. It's supposed to 1-fill seq_len for both decode and prefill, but I mistakenly 1-filled seq_len for decode only, and also 1-filled start_positions for decode instead of the prefill seq_len. #723 adds 1-filling for prefill, and this PR removes the mistaken start_positions 1-filling for decode. After this PR, the shortfin concurrent tests should work properly. Up next: a failing trie KV-sharing test case.
nod-ai · Jan 15, 2025 · e34ffec · e34ffec
1 parent cbcff3d
commit e34ffec
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 11 deletions.
diff --git a/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py b/app_tests/integration_tests/llm/shortfin/cpu_llm_server_test.py
@@ -85,10 +85,6 @@ def test_basic_generation(self, server: tuple[Any, int]) -> None:
         indirect=True,
     )
     @pytest.mark.parametrize("concurrent_requests", [2, 4, 8])
-    @pytest.mark.xfail(
-        raises=AccuracyValidationException,
-        reason="Concurreny issues in Shortfin batch processing",
-    )
     def test_concurrent_generation(
         self, server: tuple[Any, int], concurrent_requests: int
     ) -> None:

diff --git a/shortfin/python/shortfin_apps/llm/components/service.py b/shortfin/python/shortfin_apps/llm/components/service.py
@@ -388,22 +388,19 @@ async def run(self):
                 seq_lens_host.copy_to(seq_lens)
 
             # For decode, populate start_positions and seq_lens.
-            # paged_llm_v1 and export_paged_llm_v1 do some funky things with start_positions and seq_lens
-            # TODO: make them not so funky
             if self.phase == InferencePhase.DECODE:
                 start_positions_host = start_positions.for_transfer()
                 with start_positions_host.map(discard=True) as m:
-                    m.fill(
-                        1
-                    )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                    m.fill(0)
                     m.items = [req.start_position for req in self.exec_requests]
                 start_positions_host.copy_to(start_positions)
 
                 seq_lens_host = seq_lens.for_transfer()
                 with seq_lens_host.map(discard=True) as m:
+                    # Pad unused requests.
                     m.fill(
-                        1
-                    )  # Pad unused requests. Must pad with nonzero value because division by 0 floods clobber page (page 0) in cache with NaN values.
+                        1  # Must pad with a nonzero value because a division by 0 during softmax floods clobber page (page 0) in cache with NaN values.
+                    )
                     m.items = [
                         req.start_position + len(req.input_token_ids)
                         for req in self.exec_requests