[WIP] Before plan execution.

2026-01-19 03:30:44 +08:00
parent e6e0dc5d7d
commit 9e6fdc0650
5 changed files with 377 additions and 10 deletions
--- a/nanovllm/layers/attention.py
+++ b/nanovllm/layers/attention.py
@@ -207,8 +207,10 @@ class Attention(nn.Module):
            # Get prefilled CPU blocks (blocks from previous chunks)
            cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)

-            # Apply sparse policy if enabled (Quest returns all blocks for prefill since query=None)
+            # Apply sparse policy if enabled
            sparse_policy = kvcache_manager.sparse_policy
+
+            # === Standard sparse policy (Quest, etc.) ===
            if cpu_block_table and sparse_policy is not None:
                num_chunks = getattr(context, 'num_chunks', current_chunk_idx + 1)
                policy_ctx = PolicyContext(