[feat] Need to optimize with async prefetch.

2025-12-15 06:58:40 +08:00
parent 1081ab51ea
commit b8b6478506
9 changed files with 556 additions and 404 deletions
--- a/nanovllm/utils/context.py
+++ b/nanovllm/utils/context.py
@@ -32,6 +32,8 @@ class Context:
    # Starting position within block where decode tokens began (for accumulated token tracking)
    # Used when batching decode offloads - we need to attend to all accumulated tokens
    decode_start_pos_in_block: int = 0
+    # Current chunk index for ring buffer pipeline (prefill only)
+    current_chunk_idx: int = 0


 _CONTEXT = Context()
@@ -57,6 +59,7 @@ def set_context(
    chunked_seq=None,
    decode_pos_in_block=0,
    decode_start_pos_in_block=0,
+    current_chunk_idx=0,
 ):
    global _CONTEXT
    _CONTEXT = Context(
@@ -75,6 +78,7 @@ def set_context(
        chunked_seq=chunked_seq,
        decode_pos_in_block=decode_pos_in_block,
        decode_start_pos_in_block=decode_start_pos_in_block,
+        current_chunk_idx=current_chunk_idx,
    )