Merge branch 'tzj/minference' of ssh://git.zijie-tian.site:2222/zijie-tian/nano-vllm into tzj/minference

Zijie Tian
2026-01-20 02:16:39 +08:00
21 changed files with 1743 additions and 698 deletions

View File

@@ -7,8 +7,9 @@ import torch
class SparsePolicyType(Enum):
"""Sparse attention policy types."""
FULL = auto() # No sparse attention (load all blocks)
QUEST = auto() # Query-aware Top-K block selection (decode only)
FULL = auto() # No sparse attention (load all blocks)
QUEST = auto() # Query-aware Top-K block selection (decode only)
XATTN_BSA = auto() # XAttention Block Sparse Attention (prefill only, chunked)
@dataclass
@@ -37,12 +38,20 @@ class Config:
num_cpu_kvcache_blocks: int = -1
# Sparse attention configuration
# Quest: decode-only sparse attention with Top-K block selection
# FULL: no sparse attention (load all blocks)
# QUEST: decode-only sparse attention with Top-K block selection
# XATTN_BSA: prefill-only block sparse attention with chunk-level selection
sparse_policy: SparsePolicyType = SparsePolicyType.FULL
sparse_topk_blocks: int = 8 # Top-K blocks for Quest
sparse_threshold_blocks: int = 4 # Apply sparse only when blocks > threshold
# XAttention BSA specific parameters
sparse_block_size: int = 128 # Block size for BSA (tokens per block)
sparse_samples_per_chunk: int = 128 # Samples per chunk for estimation
sparse_threshold: float = 0.9 # Cumulative attention threshold (0-1)
sparse_use_triton: bool = True # Use Triton kernels for estimation
sparse_stride: int = 8 # Stride for Q/K downsampling
def __post_init__(self):
assert os.path.isdir(self.model)
assert self.kvcache_block_size % 256 == 0
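For reference, a minimal sketch of how the new fields above might be set (field names come from this diff; the model path and offload flag are illustrative, and the path must point at a real local model directory because of the assert in __post_init__):

from nanovllm.config import Config, SparsePolicyType

config = Config(
    model="/path/to/model",                    # hypothetical local model directory
    enable_cpu_offload=True,
    sparse_policy=SparsePolicyType.XATTN_BSA,  # prefill-only block sparse attention
    sparse_block_size=128,                     # tokens per BSA block
    sparse_samples_per_chunk=128,              # tokens sampled per chunk for estimation
    sparse_threshold=0.9,                      # keep chunks up to 90% cumulative attention
    sparse_use_triton=True,
    sparse_stride=8,
)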

View File

@@ -142,8 +142,26 @@ class ModelRunner:
block_bytes = 2 * hf_config.num_hidden_layers * self.block_size * num_kv_heads * head_dim * hf_config.torch_dtype.itemsize
# Calculate max GPU blocks based on available memory
max_gpu_blocks = int(total * config.gpu_memory_utilization - used - peak + current) // block_bytes
assert max_gpu_blocks > 0
# In CPU offload mode with shared GPU, use actual free memory instead of total * utilization
if config.enable_cpu_offload and used > total * 0.5:
# GPU is shared with other processes, use actual free memory
available_memory = free * 0.9 # Leave 10% buffer
else:
# Standard calculation for dedicated GPU usage
available_memory = total * config.gpu_memory_utilization - used - peak + current
max_gpu_blocks = int(available_memory) // block_bytes
if max_gpu_blocks <= 0:
raise RuntimeError(
f"Insufficient GPU memory for KV cache allocation. "
f"Total: {total/1024**3:.2f} GB, "
f"Used by other processes: {used/1024**3:.2f} GB, "
f"Free: {free/1024**3:.2f} GB, "
f"Available: {available_memory/1024**3:.2f} GB, "
f"Required per block: {block_bytes/1024**2:.2f} MB. "
f"Try waiting for GPU to be available or reduce model size."
)
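As a rough worked example of the block budget above (all numbers are illustrative; the real code also accounts for the allocator's peak/current statistics):

# Hypothetical 32-layer model, 8 KV heads, head_dim 128, block_size 256, fp16 (2 bytes).
block_bytes = 2 * 32 * 256 * 8 * 128 * 2       # K and V -> 33_554_432 bytes (32 MiB) per block
total, used, free = 80e9, 50e9, 30e9           # shared-GPU memory snapshot, in bytes
if used > total * 0.5:                         # offload mode on a shared GPU
    available_memory = free * 0.9              # leave a 10% buffer
else:
    available_memory = total * 0.9 - used      # simplified dedicated-GPU path
max_gpu_blocks = int(available_memory) // block_bytes   # 27e9 // 32 MiB -> 804 blocks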
# Determine final GPU blocks: user-specified or auto (max available)
if config.num_gpu_blocks > 0:

View File

@@ -64,11 +64,24 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
# Create sparse policy from config enum
# Quest is decode-only: prefill returns all blocks (query=None), decode does Top-K
sparse_policy_type = getattr(config, 'sparse_policy', SparsePolicyType.FULL)
sparse_policy = create_sparse_policy(
sparse_policy_type,
topk_blocks=getattr(config, 'sparse_topk_blocks', 8),
threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
)
# Build policy kwargs based on policy type
policy_kwargs = {}
if sparse_policy_type == SparsePolicyType.QUEST:
policy_kwargs = {
'topk_blocks': getattr(config, 'sparse_topk_blocks', 8),
'threshold_blocks': getattr(config, 'sparse_threshold_blocks', 4),
}
elif sparse_policy_type == SparsePolicyType.XATTN_BSA:
policy_kwargs = {
'block_size': getattr(config, 'sparse_block_size', 128),
'samples_per_chunk': getattr(config, 'sparse_samples_per_chunk', 128),
'threshold': getattr(config, 'sparse_threshold', 0.9),
'use_triton': getattr(config, 'sparse_use_triton', True),
'stride': getattr(config, 'sparse_stride', 8),
}
sparse_policy = create_sparse_policy(sparse_policy_type, **policy_kwargs)
return HybridKVCacheManager(
num_gpu_slots=num_gpu_blocks,

View File

@@ -905,3 +905,60 @@ class OffloadEngine:
def wait_prefill_offload(self, layer_id: int) -> None:
"""Wait for a specific layer's prefill offload to complete."""
self.prefill_offload_events[layer_id].synchronize()
# ========== XAttention BSA Helper Methods ==========
def load_block_sample_from_cpu(
self,
cpu_block_id: int,
layer_id: int,
num_samples: int,
) -> Tuple[Tensor, Tensor]:
"""
Load sample tokens from a CPU block for XAttention BSA estimation.
This is used in the estimate phase of XAttention BSA to load a small
sample of tokens from each historical chunk for importance estimation.
Args:
cpu_block_id: Source CPU block ID
layer_id: Layer index
num_samples: Number of tokens to sample
Returns:
(k_sample, v_sample) tensors, shape: [num_samples, kv_heads, head_dim]
"""
# Sample from the beginning of the block
k_sample = self.k_cache_cpu[
layer_id, cpu_block_id, :num_samples
].clone().cuda()
v_sample = self.v_cache_cpu[
layer_id, cpu_block_id, :num_samples
].clone().cuda()
return k_sample, v_sample
def load_block_full_from_cpu(
self,
cpu_block_id: int,
layer_id: int,
) -> Tuple[Tensor, Tensor]:
"""
Load full tokens from a CPU block for XAttention BSA computation.
This is used in the compute phase of XAttention BSA to load the full
data for selected important chunks.
Args:
cpu_block_id: Source CPU block ID
layer_id: Layer index
Returns:
(k_full, v_full) tensors, shape: [block_size, kv_heads, head_dim]
"""
k_full = self.k_cache_cpu[
layer_id, cpu_block_id
].clone().cuda()
v_full = self.v_cache_cpu[
layer_id, cpu_block_id
].clone().cuda()
return k_full, v_full
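A sketch of how a BSA-style policy might consume these helpers after selection (gather_selected_kv is a hypothetical helper, not part of this commit):

import torch

def gather_selected_kv(offload_engine, layer_id, selected_blocks):
    """Fetch full KV for the CPU blocks a policy decided to keep."""
    ks, vs = [], []
    for block_id in selected_blocks:
        k_f, v_f = offload_engine.load_block_full_from_cpu(block_id, layer_id)
        ks.append(k_f)
        vs.append(v_f)
    # Concatenate along the token dimension: [num_selected * block_size, kv_heads, head_dim]
    return torch.cat(ks, dim=0), torch.cat(vs, dim=0)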

View File

@@ -23,6 +23,7 @@ from nanovllm.config import SparsePolicyType
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
from nanovllm.kvcache.sparse.xattn_bsa import XAttentionBSAPolicy
def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
@@ -55,6 +56,13 @@ def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
)
return QuestPolicy(config)
elif policy_type == SparsePolicyType.XATTN_BSA:
return XAttentionBSAPolicy(
block_size=kwargs.get("block_size", 128),
samples_per_chunk=kwargs.get("samples_per_chunk", 128),
threshold=kwargs.get("threshold", 0.9),
)
else:
raise ValueError(f"Unknown policy type: {policy_type}")
@@ -67,5 +75,6 @@ __all__ = [
"QuestPolicy",
"QuestConfig",
"BlockMetadataManager",
"XAttentionBSAPolicy",
"create_sparse_policy",
]
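Illustrative use of the factory (assuming this module is importable as nanovllm.kvcache.sparse):

from nanovllm.config import SparsePolicyType
from nanovllm.kvcache.sparse import create_sparse_policy

policy = create_sparse_policy(
    SparsePolicyType.XATTN_BSA,
    block_size=128,
    samples_per_chunk=128,
    threshold=0.9,
)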

View File

@@ -5,8 +5,19 @@ This serves as a baseline and default policy when sparse
attention is not needed.
"""
from typing import List
import logging
import torch
from typing import List, Optional, TYPE_CHECKING
from .policy import SparsePolicy, PolicyContext
from nanovllm.utils.context import get_context
if TYPE_CHECKING:
from nanovllm.kvcache.offload_engine import OffloadEngine
from nanovllm.kvcache.manager import KVCacheManager
from nanovllm.engine.sequence import Sequence
logger = logging.getLogger(__name__)
class FullAttentionPolicy(SparsePolicy):
@@ -29,10 +40,157 @@ class FullAttentionPolicy(SparsePolicy):
def select_blocks(
self,
available_blocks: List[int],
offload_engine: "OffloadEngine",
ctx: PolicyContext,
) -> List[int]:
"""Return all blocks - no sparsity."""
return available_blocks
def compute_chunked_attention(
self,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
layer_id: int,
softmax_scale: float,
offload_engine: "OffloadEngine",
kvcache_manager: "KVCacheManager",
current_chunk_idx: int,
seq: "Sequence",
num_tokens: int,
) -> torch.Tensor:
"""
Compute full attention for chunked prefill.
This method handles the complete chunked prefill flow:
1. Get historical blocks
2. Select blocks via select_blocks
3. Load and compute attention to historical chunks
4. Compute attention to current chunk
5. Merge all results
Args:
q: Query tensor [seq_len, num_heads, head_dim]
k: Key tensor [seq_len, num_kv_heads, head_dim] (unused, from prefill buffer)
v: Value tensor [seq_len, num_kv_heads, head_dim] (unused, from prefill buffer)
layer_id: Current layer index
softmax_scale: Softmax scaling factor
offload_engine: OffloadEngine for loading blocks
kvcache_manager: KVCacheManager for block management
current_chunk_idx: Current chunk index
seq: Sequence object
num_tokens: Number of tokens in current chunk
Returns:
Attention output [seq_len, num_heads, head_dim]
"""
from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
logger.debug(f"[DEBUG] FullPolicy.compute_chunked_attention called, "
f"layer={layer_id}, chunk={current_chunk_idx}, num_tokens={num_tokens}")
q_batched = q.unsqueeze(0) # [1, seq_len, num_heads, head_dim]
o_acc = None
lse_acc = None
compute_stream = offload_engine.compute_stream
# Step 1: Get historical blocks
cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)
# Step 2: Apply select_blocks to filter blocks
if cpu_block_table:
num_chunks = current_chunk_idx + 1
policy_ctx = PolicyContext(
query_chunk_idx=current_chunk_idx,
num_query_chunks=num_chunks,
layer_id=layer_id,
query=None, # Prefill typically doesn't use query for selection
is_prefill=True,
block_size=kvcache_manager.block_size,
total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
)
cpu_block_table = self.select_blocks(cpu_block_table, offload_engine, policy_ctx)
logger.debug(f"[DEBUG] select_blocks: output={len(cpu_block_table)} blocks")
if cpu_block_table:
load_slots = list(range(offload_engine.num_ring_slots))
num_blocks = len(cpu_block_table)
if len(load_slots) == 1:
# Only 1 slot - use synchronous mode
slot = load_slots[0]
for block_idx in range(num_blocks):
cpu_block_id = cpu_block_table[block_idx]
offload_engine.load_to_slot_layer(slot, layer_id, cpu_block_id)
offload_engine.wait_slot_layer(slot)
with torch.cuda.stream(compute_stream):
prev_k, prev_v = offload_engine.get_kv_for_slot(slot)
prev_o, prev_lse = flash_attn_with_lse(
q_batched, prev_k, prev_v,
softmax_scale=softmax_scale,
causal=False,
)
if o_acc is None:
o_acc, lse_acc = prev_o, prev_lse
else:
o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
offload_engine.record_slot_compute_done(slot)
else:
# Multiple slots - use pipeline
num_slots = len(load_slots)
num_preload = min(num_slots, num_blocks)
for i in range(num_preload):
offload_engine.load_to_slot_layer(load_slots[i], layer_id, cpu_block_table[i])
for block_idx in range(num_blocks):
current_slot = load_slots[block_idx % num_slots]
cpu_block_id = cpu_block_table[block_idx]
offload_engine.wait_slot_layer(current_slot)
with torch.cuda.stream(compute_stream):
prev_k, prev_v = offload_engine.get_kv_for_slot(current_slot)
prev_o, prev_lse = flash_attn_with_lse(
q_batched, prev_k, prev_v,
softmax_scale=softmax_scale,
causal=False,
)
offload_engine.record_slot_compute_done(current_slot)
if o_acc is None:
o_acc, lse_acc = prev_o, prev_lse
else:
o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
# Issue next transfer
next_block_idx = block_idx + num_slots
if next_block_idx < num_blocks:
next_slot = load_slots[next_block_idx % num_slots]
next_cpu_block_id = cpu_block_table[next_block_idx]
offload_engine.load_to_slot_layer(next_slot, layer_id, next_cpu_block_id)
# Step 4: Compute attention to current chunk (causal mask)
with torch.cuda.stream(compute_stream):
k_curr, v_curr = offload_engine.get_prefill_buffer_slice(layer_id, num_tokens)
current_o, current_lse = flash_attn_with_lse(
q_batched, k_curr, v_curr,
softmax_scale=softmax_scale,
causal=True,
)
# Step 5: Merge historical and current attention
with torch.cuda.stream(compute_stream):
if o_acc is None:
final_o = current_o
else:
final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
# Sync default stream with compute_stream before returning
torch.cuda.default_stream().wait_stream(compute_stream)
# Remove batch dimension: [1, seq_len, num_heads, head_dim] -> [seq_len, num_heads, head_dim]
return final_o.squeeze(0)
def __repr__(self) -> str:
return "FullAttentionPolicy()"

View File

@@ -7,12 +7,17 @@ from CPU for each query chunk during chunked attention computation.
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import List, Optional, Any
from typing import List, Optional, Any, TYPE_CHECKING
import torch
# Import SparsePolicyType from config to avoid circular imports
from nanovllm.config import SparsePolicyType
if TYPE_CHECKING:
from nanovllm.kvcache.offload_engine import OffloadEngine
from nanovllm.kvcache.manager import KVCacheManager
from nanovllm.engine.sequence import Sequence
@dataclass
class PolicyContext:
@@ -35,8 +40,8 @@ class PolicyContext:
query: Optional[torch.Tensor]
"""
Query tensor for current chunk.
Shape: [1, num_heads, head_dim] for decode, [1, seq_len, num_heads, head_dim] for prefill.
May be None if not available (e.g., some prefill scenarios).
Shape: [1, num_heads, head_dim] for decode, [seq_len, num_heads, head_dim] for prefill.
Available for both prefill and decode phases.
"""
is_prefill: bool
@@ -107,6 +112,7 @@ class SparsePolicy(ABC):
def select_blocks(
self,
available_blocks: List[int],
offload_engine: "OffloadEngine",
ctx: PolicyContext,
) -> List[int]:
"""
@@ -120,6 +126,8 @@ class SparsePolicy(ABC):
available_blocks: List of CPU block IDs that contain KV cache
from previous chunks. These are ordered by
their position in the sequence.
offload_engine: OffloadEngine for loading KV (some policies need
to load KV to make selection decisions).
ctx: PolicyContext with information about the current query
chunk, layer, phase (prefill/decode), etc.
@@ -183,5 +191,47 @@ class SparsePolicy(ABC):
"""
pass
@abstractmethod
def compute_chunked_attention(
self,
q: torch.Tensor,
k: torch.Tensor,
v: torch.Tensor,
layer_id: int,
softmax_scale: float,
offload_engine: "OffloadEngine",
kvcache_manager: "KVCacheManager",
current_chunk_idx: int,
seq: "Sequence",
num_tokens: int,
) -> torch.Tensor:
"""
Compute chunked prefill attention (complete flow).
This is the main entry point for prefill attention computation.
It defines the complete prefill flow:
1. Get historical blocks
2. Select blocks (call select_blocks)
3. Load and compute historical blocks via offload_engine
4. Get current chunk KV from offload_engine, compute attention
5. Merge all results
Args:
q: [seq_len, num_heads, head_dim] query for current chunk
k: [seq_len, num_kv_heads, head_dim] key for current chunk (in prefill buffer)
v: [seq_len, num_kv_heads, head_dim] value for current chunk (in prefill buffer)
layer_id: transformer layer index
softmax_scale: softmax scaling factor
offload_engine: OffloadEngine for loading blocks
kvcache_manager: KVCacheManager for block management
current_chunk_idx: current chunk index
seq: Sequence object
num_tokens: number of tokens in current chunk
Returns:
[seq_len, num_heads, head_dim] final attention output
"""
pass
def __repr__(self) -> str:
return f"{self.__class__.__name__}()"

View File

@@ -0,0 +1,70 @@
"""
XAttention Block Sparse Attention (BSA) Policy for nano-vllm.
This module implements XAttention-inspired block sparse attention for chunked prefill.
Current implementation loads all historical blocks (FULL strategy).
Sparse selection to be implemented in next phase.
"""
import torch
from typing import List, Optional, Tuple
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
from nanovllm.utils.context import get_context
class XAttentionBSAPolicy(SparsePolicy):
"""
XAttention Block Sparse Attention policy for chunked prefill.
This policy uses block-level estimation to determine which KV blocks
are important for the current chunk's queries, enabling sparse computation.
Note: Current implementation loads all historical chunks (FULL strategy).
Sparse selection to be implemented in next phase.
"""
supports_prefill = False # Sparse selection not yet applied during prefill (FULL strategy for now)
supports_decode = False # BSA is prefill-only
requires_block_selection = False # Selection happens at chunk level, not block level
def __init__(
self,
block_size: int = 128,
samples_per_chunk: int = 128,
threshold: float = 0.9,
):
"""
Initialize XAttention BSA policy.
Args:
block_size: Number of tokens per block (default: 128)
samples_per_chunk: Number of tokens to sample from each historical chunk for estimation
threshold: Cumulative attention threshold for chunk selection (0-1)
"""
self.block_size = block_size
self.samples_per_chunk = samples_per_chunk
self.threshold = threshold
def select_blocks(self, available_blocks: List[int], offload_engine: "OffloadEngine", ctx: PolicyContext) -> List[int]:
"""
Select blocks to load from CPU.
Current implementation returns all blocks (FULL strategy).
Sparse selection to be implemented in next phase.
Args:
available_blocks: List of all available CPU block IDs
offload_engine: OffloadEngine for loading KV (unused by the current FULL strategy)
ctx: Policy context with query info, chunk index, etc.
Returns:
List of selected block IDs to load
"""
# Current: Return all blocks (FULL strategy)
# TODO: Implement sparse selection based on query attention estimation
return available_blocks
def reset(self) -> None:
"""Reset policy state."""
pass
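The cumulative-attention threshold mentioned above could drive selection roughly as follows (the per-chunk scores would come from the sampled-token estimation, which is not part of this commit; this is an assumed sketch):

import torch

def select_chunks_by_threshold(chunk_scores: torch.Tensor, threshold: float = 0.9) -> list:
    # chunk_scores: [num_chunks] estimated attention mass for each historical chunk.
    probs = torch.softmax(chunk_scores, dim=-1)
    order = torch.argsort(probs, descending=True)
    cum = torch.cumsum(probs[order], dim=0)
    # Keep the highest-scoring chunks until their cumulative mass reaches the threshold.
    keep = int((cum < threshold).sum().item()) + 1
    return sorted(order[:keep].tolist())   # return in sequence order for loading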

View File

@@ -174,116 +174,45 @@ class Attention(nn.Module):
"""
Compute attention with per-layer prefill buffer for async offload.
Optimized design:
- Current chunk's KV is written to per-layer prefill buffer (not GPU slot)
- Previous chunks' KV are loaded from CPU using GPU slots
- Each layer offloads from its own buffer - no waiting required!
Simplified design:
- All computation logic is delegated to sparse_policy.compute_chunked_attention()
- This method only handles async offload after computation
For each layer:
1. Current chunk's KV is in prefill_buffer[layer_id] (just written by model)
2. Load previous chunks from CPU using available slots (pipeline)
3. Compute attention against previous KV (no causal mask)
4. Compute attention against current KV from prefill buffer (causal)
5. Merge all results using online softmax
6. Async offload prefill buffer to CPU (no waiting!)
The policy handles:
1. Loading historical blocks from CPU
2. Computing attention against historical KV (no causal mask)
3. Computing attention against current KV from prefill buffer (causal)
4. Merging all results
"""
from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
current_chunk_idx = context.current_chunk_idx
torch.cuda.nvtx.range_push(f"ChunkedPrefill: L{self.layer_id} Chunk{current_chunk_idx}")
# q shape: [total_tokens, num_heads, head_dim]
q_batched = q.unsqueeze(0) # [1, total_tokens, heads, dim]
num_tokens = k.shape[0]
o_acc = None
lse_acc = None
kvcache_manager = context.kvcache_manager
seq = context.chunked_seq if hasattr(context, 'chunked_seq') else None
offload_engine = kvcache_manager.offload_engine if kvcache_manager is not None else None
if kvcache_manager is not None and seq is not None and self.layer_id >= 0:
# Get prefilled CPU blocks (blocks from previous chunks)
cpu_block_table = kvcache_manager.get_prefilled_cpu_blocks(seq)
# Get sparse policy - required for chunked prefill
sparse_policy = kvcache_manager.sparse_policy
if sparse_policy is None:
raise RuntimeError("sparse_policy is required for chunked prefill")
# Apply sparse policy if enabled (Quest returns all blocks for prefill since query=None)
sparse_policy = kvcache_manager.sparse_policy
if cpu_block_table and sparse_policy is not None:
num_chunks = getattr(context, 'num_chunks', current_chunk_idx + 1)
policy_ctx = PolicyContext(
query_chunk_idx=current_chunk_idx,
num_query_chunks=num_chunks,
layer_id=self.layer_id,
query=None, # Prefill typically doesn't use query for selection
is_prefill=True,
block_size=kvcache_manager.block_size,
total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
)
cpu_block_table = sparse_policy.select_blocks(
cpu_block_table, policy_ctx
)
# [DEBUG] Verify execution path
logger.debug(f"[DEBUG] Calling sparse_policy.compute_chunked_attention, "
f"policy={sparse_policy}, layer={self.layer_id}, chunk={current_chunk_idx}")
if cpu_block_table:
# Get available load slots (all slots can be used since we use prefill buffer)
load_slots = list(range(offload_engine.num_ring_slots))
pipeline_depth = len(load_slots)
if pipeline_depth == 0:
# Only 1 slot total, cannot pipeline - use sync loading
o_acc, lse_acc = self._sync_load_previous_chunks(
q_batched, cpu_block_table, offload_engine
)
else:
# Use ring buffer pipeline
o_acc, lse_acc = self._ring_buffer_pipeline_load(
q_batched, cpu_block_table, load_slots, offload_engine,
current_chunk_idx
)
# Get compute stream for all attention operations
compute_stream = offload_engine.compute_stream if offload_engine is not None else None
# Compute attention against current chunk's KV from prefill buffer (with causal mask)
if compute_stream is not None:
with torch.cuda.stream(compute_stream):
torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} CurrentChunk (causal)")
# Get KV from per-layer prefill buffer
k_batched, v_batched = offload_engine.get_prefill_buffer_slice(self.layer_id, num_tokens)
current_o, current_lse = flash_attn_with_lse(
q_batched,
k_batched,
v_batched,
softmax_scale=self.scale,
causal=True,
)
torch.cuda.nvtx.range_pop()
else:
torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} CurrentChunk (causal)")
k_batched = k.unsqueeze(0)
v_batched = v.unsqueeze(0)
current_o, current_lse = flash_attn_with_lse(
q_batched,
k_batched,
v_batched,
softmax_scale=self.scale,
causal=True,
)
torch.cuda.nvtx.range_pop()
# Merge with accumulated (all on compute_stream for consistency)
if o_acc is None:
final_o = current_o
else:
if compute_stream is not None:
with torch.cuda.stream(compute_stream):
torch.cuda.nvtx.range_push(f"MergeAttn: L{self.layer_id}")
final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
torch.cuda.nvtx.range_pop()
else:
torch.cuda.nvtx.range_push(f"MergeAttn: L{self.layer_id}")
final_o, _ = merge_attention_outputs(o_acc, lse_acc, current_o, current_lse)
torch.cuda.nvtx.range_pop()
# Delegate all computation to policy (no flash_attn or merge calls here!)
final_o = sparse_policy.compute_chunked_attention(
q, k, v,
self.layer_id,
self.scale,
offload_engine,
kvcache_manager,
current_chunk_idx,
seq,
num_tokens,
)
torch.cuda.nvtx.range_pop() # ChunkedPrefill
@@ -298,181 +227,7 @@ class Attention(nn.Module):
self.layer_id, cpu_block_id, num_tokens
)
# Sync default stream with compute_stream before returning
# This ensures the result is ready for the rest of the model (layernorm, MLP)
if compute_stream is not None:
torch.cuda.default_stream().wait_stream(compute_stream)
# Remove batch dimension: [1, total_tokens, heads, dim] -> [total_tokens, heads, dim]
return final_o.squeeze(0)
def _sync_load_previous_chunks(
self,
q_batched: torch.Tensor,
cpu_block_table: list,
offload_engine,
):
"""Synchronous loading fallback when pipeline_depth=0."""
from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
o_acc, lse_acc = None, None
compute_stream = offload_engine.compute_stream
for block_idx, cpu_block_id in enumerate(cpu_block_table):
# Load to slot 0 (single slot)
offload_engine.load_to_slot_layer(0, self.layer_id, cpu_block_id)
offload_engine.wait_slot_layer(0)
# IMPORTANT: Must use compute_stream to match wait_slot_layer
with torch.cuda.stream(compute_stream):
prev_k, prev_v = offload_engine.get_kv_for_slot(0)
prev_o, prev_lse = flash_attn_with_lse(
q_batched, prev_k, prev_v,
softmax_scale=self.scale,
causal=False,
)
if o_acc is None:
o_acc, lse_acc = prev_o, prev_lse
else:
o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
return o_acc, lse_acc
def _ring_buffer_pipeline_load(
self,
q_batched: torch.Tensor,
cpu_block_table: list,
load_slots: list,
offload_engine,
current_chunk_idx: int = -1,
):
"""
Ring buffer async pipeline loading with double buffering.
Uses compute_done events to ensure safe buffer reuse:
- Before loading to slot X, wait for previous compute on slot X to finish
- Before computing on slot X, wait for load to slot X to finish
Timeline with 2 slots (A, B):
┌──────────────┐
│ Load B0→A │
└──────────────┘
┌──────────────┐ ┌──────────────┐
│ Load B1→B │ │ Load B2→A │ ...
└──────────────┘ └──────────────┘
↘ ↘
┌──────────────┐ ┌──────────────┐
│ Compute(A) │ │ Compute(B) │ ...
└──────────────┘ └──────────────┘
The load_to_slot_layer internally waits for compute_done[slot] before
starting the transfer, ensuring no data race.
"""
from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
num_blocks = len(cpu_block_table)
if num_blocks == 0:
return None, None
pipeline_depth = len(load_slots)
if pipeline_depth == 0:
return None, None
o_acc, lse_acc = None, None
if pipeline_depth == 1:
# Only 1 slot available, cannot pipeline - use synchronous mode
# IMPORTANT: Must use compute_stream to match synchronization in
# load_to_slot_layer (waits for compute_done) and wait_slot_layer
slot = load_slots[0]
compute_stream = offload_engine.compute_stream
for block_idx in range(num_blocks):
cpu_block_id = cpu_block_table[block_idx]
offload_engine.load_to_slot_layer(slot, self.layer_id, cpu_block_id)
offload_engine.wait_slot_layer(slot)
with torch.cuda.stream(compute_stream):
# Debug: call hooks on compute_stream (synchronized with transfer)
if offload_engine.debug_mode:
offload_engine._call_debug_hooks(slot, self.layer_id, cpu_block_id)
prev_k, prev_v = offload_engine.get_kv_for_slot(slot)
prev_o, prev_lse = flash_attn_with_lse(
q_batched, prev_k, prev_v,
softmax_scale=self.scale,
causal=False,
)
# Record compute done so next load can safely reuse this slot
offload_engine.record_slot_compute_done(slot)
if o_acc is None:
o_acc, lse_acc = prev_o, prev_lse
else:
o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
return o_acc, lse_acc
# N-way pipeline: use ALL available slots for maximum overlap
# Pipeline depth = num_slots - 1 (num_slots blocks in flight)
num_slots = len(load_slots)
# Phase 1: Pre-load up to num_slots blocks to fill the pipeline
# This starts all transfers in parallel, utilizing full PCIe bandwidth
num_preload = min(num_slots, num_blocks)
for i in range(num_preload):
offload_engine.load_to_slot_layer(load_slots[i], self.layer_id, cpu_block_table[i])
# Phase 2: Main loop - compute and immediately reuse slot for next transfer
# Use dedicated compute_stream (not default stream) to enable overlap with transfers
compute_stream = offload_engine.compute_stream
for block_idx in range(num_blocks):
torch.cuda.nvtx.range_push(f"PipelineBlock: L{self.layer_id} B{block_idx}")
# Cycle through slots: slot[block_idx % num_slots]
current_slot = load_slots[block_idx % num_slots]
cpu_block_id = cpu_block_table[block_idx]
# Wait for current slot's transfer to complete (on compute_stream)
offload_engine.wait_slot_layer(current_slot)
# Compute attention on current slot's data
# IMPORTANT: Use dedicated compute_stream to avoid implicit sync with default stream
with torch.cuda.stream(compute_stream):
# Debug: call hooks on compute_stream (synchronized with transfer)
if offload_engine.debug_mode:
offload_engine._call_debug_hooks(current_slot, self.layer_id, cpu_block_id)
torch.cuda.nvtx.range_push(f"FlashAttn: L{self.layer_id} PrevBlock{block_idx}")
prev_k, prev_v = offload_engine.get_kv_for_slot(current_slot)
prev_o, prev_lse = flash_attn_with_lse(
q_batched, prev_k, prev_v,
softmax_scale=self.scale,
causal=False,
)
torch.cuda.nvtx.range_pop()
# Record compute done - this allows the next transfer to safely overwrite this slot
offload_engine.record_slot_compute_done(current_slot)
# Immediately start loading the NEXT block into this slot (if more blocks remain)
# Key insight: reuse current_slot immediately after compute is done!
next_block_idx = block_idx + num_slots
if next_block_idx < num_blocks:
offload_engine.load_to_slot_layer(current_slot, self.layer_id, cpu_block_table[next_block_idx])
# Merge with accumulated (also on compute_stream for consistency)
with torch.cuda.stream(compute_stream):
if o_acc is None:
o_acc, lse_acc = prev_o, prev_lse
else:
o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
torch.cuda.nvtx.range_pop() # PipelineBlock
return o_acc, lse_acc
return final_o
def _chunked_decode_attention(
self,
@@ -517,6 +272,8 @@ class Attention(nn.Module):
if last_block_valid_tokens == 0 and total_prefill_tokens > 0:
last_block_valid_tokens = block_size # Last block was exactly full
offload_engine = kvcache_manager.offload_engine
# Apply sparse policy if enabled (Quest does Top-K selection for decode)
sparse_policy = kvcache_manager.sparse_policy
if sparse_policy is not None:
@@ -530,11 +287,9 @@ class Attention(nn.Module):
total_kv_len=len(cpu_block_table) * kvcache_manager.block_size,
)
cpu_block_table = sparse_policy.select_blocks(
cpu_block_table, policy_ctx
cpu_block_table, offload_engine, policy_ctx
)
offload_engine = kvcache_manager.offload_engine
# Use cross-layer pipeline if active (initialized in model_runner)
if offload_engine.is_pipeline_active():
o_acc, lse_acc = self._decode_with_layer_pipeline(