♻️ refactor: migrate chunked prefill attention to SparsePolicy

Move all chunked prefill attention computation from attention.py to
SparsePolicy.compute_chunked_attention(). This is the v4 architecture
refactoring for sparse attention policies.

Changes:
- Add compute_chunked_attention abstract method to SparsePolicy base
- Add offload_engine parameter to select_blocks for policies needing
  KV access during block selection
- Implement compute_chunked_attention in FullAttentionPolicy with
  complete ring buffer pipeline logic
- Simplify attention.py to delegate all chunked prefill to policy
- Remove redundant _sync_load_previous_chunks and
  _ring_buffer_pipeline_load methods from Attention class

Test: test_needle.py --enable-offload PASSED

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Zijie Tian
2026-01-20 00:58:46 +08:00
parent 6783a45e6f
commit baa4be7e2e
4 changed files with 240 additions and 297 deletions

View File

@@ -7,12 +7,17 @@ from CPU for each query chunk during chunked attention computation.
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional, Any
from typing import List, Optional, Any, TYPE_CHECKING
import torch
# Import SparsePolicyType from config to avoid circular imports
from nanovllm.config import SparsePolicyType
if TYPE_CHECKING:
from nanovllm.kvcache.offload_engine import OffloadEngine
from nanovllm.kvcache.manager import KVCacheManager
from nanovllm.engine.sequence import Sequence
@dataclass
class PolicyContext:
@@ -107,6 +112,7 @@ class SparsePolicy(ABC):
def select_blocks(
self,
available_blocks: List[int],
offload_engine: "OffloadEngine",
ctx: PolicyContext,
) -> List[int]:
"""
@@ -120,6 +126,8 @@ class SparsePolicy(ABC):
available_blocks: List of CPU block IDs that contain KV cache
from previous chunks. These are ordered by
their position in the sequence.
offload_engine: OffloadEngine for loading KV (some policies need
to load KV to make selection decisions).
ctx: PolicyContext with information about the current query
chunk, layer, phase (prefill/decode), etc.
@@ -183,5 +191,47 @@ class SparsePolicy(ABC):
"""
pass
@abstractmethod
def compute_chunked_attention(
self,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
layer_id: int,
softmax_scale: float,
offload_engine: "OffloadEngine",
kvcache_manager: "KVCacheManager",
current_chunk_idx: int,
seq: "Sequence",
num_tokens: int,
) -> torch.Tensor:
"""
Compute chunked prefill attention (complete flow).
This is the main entry point for prefill attention computation.
It defines the complete prefill flow:
1. Get historical blocks
2. Select blocks (call select_blocks)
3. Load and compute historical blocks via offload_engine
4. Get current chunk KV from offload_engine, compute attention
5. Merge all results
Args:
q: [seq_len, num_heads, head_dim] query for current chunk
k: [seq_len, num_kv_heads, head_dim] key for current chunk (in prefill buffer)
v: [seq_len, num_kv_heads, head_dim] value for current chunk (in prefill buffer)
layer_id: transformer layer index
softmax_scale: softmax scaling factor
offload_engine: OffloadEngine for loading blocks
kvcache_manager: KVCacheManager for block management
current_chunk_idx: current chunk index
seq: Sequence object
num_tokens: number of tokens in current chunk
Returns:
[seq_len, num_heads, head_dim] final attention output
"""
pass
def __repr__(self) -> str:
return f"{self.__class__.__name__}()"