♻️ refactor: migrate chunked prefill attention to SparsePolicy

Move all chunked prefill attention computation from attention.py to SparsePolicy.compute_chunked_attention(). This is the v4 architecture refactoring for sparse attention policies.

Changes:
- Add compute_chunked_attention abstract method to SparsePolicy base
- Add offload_engine parameter to select_blocks for policies needing KV access during block selection
- Implement compute_chunked_attention in FullAttentionPolicy with complete ring buffer pipeline logic
- Simplify attention.py to delegate all chunked prefill to policy
- Remove redundant _sync_load_previous_chunks and _ring_buffer_pipeline_load methods from Attention class

Test: test_needle.py --enable-offload PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 00:58:46 +08:00
parent 6783a45e6f
commit baa4be7e2e
4 changed files with 240 additions and 297 deletions
--- a/nanovllm/kvcache/sparse/policy.py
+++ b/nanovllm/kvcache/sparse/policy.py
@@ -7,12 +7,17 @@ from CPU for each query chunk during chunked attention computation.

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import List, Optional, Any
+from typing import List, Optional, Any, TYPE_CHECKING
 import torch

 # Import SparsePolicyType from config to avoid circular imports
 from nanovllm.config import SparsePolicyType

+if TYPE_CHECKING:
+    from nanovllm.kvcache.offload_engine import OffloadEngine
+    from nanovllm.kvcache.manager import KVCacheManager
+    from nanovllm.engine.sequence import Sequence
+

@dataclass
 class PolicyContext:
@@ -107,6 +112,7 @@ class SparsePolicy(ABC):
    def select_blocks(
        self,
        available_blocks: List[int],
+        offload_engine: "OffloadEngine",
        ctx: PolicyContext,
    ) -> List[int]:
        """
@@ -120,6 +126,8 @@ class SparsePolicy(ABC):
            available_blocks: List of CPU block IDs that contain KV cache
                             from previous chunks. These are ordered by
                             their position in the sequence.
+            offload_engine: OffloadEngine for loading KV (some policies need
+                           to load KV to make selection decisions).
            ctx: PolicyContext with information about the current query
                 chunk, layer, phase (prefill/decode), etc.

@@ -183,5 +191,47 @@ class SparsePolicy(ABC):
        """
        pass

+    @abstractmethod
+    def compute_chunked_attention(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        layer_id: int,
+        softmax_scale: float,
+        offload_engine: "OffloadEngine",
+        kvcache_manager: "KVCacheManager",
+        current_chunk_idx: int,
+        seq: "Sequence",
+        num_tokens: int,
+    ) -> torch.Tensor:
+        """
+        Compute chunked prefill attention (complete flow).
+
+        This is the main entry point for prefill attention computation.
+        It defines the complete prefill flow:
+        1. Get historical blocks
+        2. Select blocks (call select_blocks)
+        3. Load and compute historical blocks via offload_engine
+        4. Get current chunk KV from offload_engine, compute attention
+        5. Merge all results
+
+        Args:
+            q: [seq_len, num_heads, head_dim] query for current chunk
+            k: [seq_len, num_kv_heads, head_dim] key for current chunk (in prefill buffer)
+            v: [seq_len, num_kv_heads, head_dim] value for current chunk (in prefill buffer)
+            layer_id: transformer layer index
+            softmax_scale: softmax scaling factor
+            offload_engine: OffloadEngine for loading blocks
+            kvcache_manager: KVCacheManager for block management
+            current_chunk_idx: current chunk index
+            seq: Sequence object
+            num_tokens: number of tokens in current chunk
+
+        Returns:
+            [seq_len, num_heads, head_dim] final attention output
+        """
+        pass
+
    def __repr__(self) -> str:
        return f"{self.__class__.__name__}()"