[WIP] Before adding Quest policy.

This commit is contained in:
Zijie Tian
2026-01-07 02:32:30 +08:00
parent f240903013
commit c99a6f3d3f
11 changed files with 166 additions and 191 deletions

View File

@@ -22,7 +22,6 @@ Usage:
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext, SparsePolicyType
from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
from nanovllm.kvcache.sparse.hybrid import HybridPolicy
def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
@@ -67,6 +66,5 @@ __all__ = [
"QuestPolicy",
"QuestConfig",
"BlockMetadataManager",
"HybridPolicy",
"create_sparse_policy",
]

View File

@@ -1,93 +0,0 @@
"""
Hybrid sparse attention policy.
Allows using different policies for prefill vs decode phases.
This is useful because optimal sparsity patterns often differ:
- Prefill: fixed patterns work well (e.g., VerticalSlash)
- Decode: query-aware selection helps (e.g., Quest)
"""
from typing import List
import torch
from .policy import SparsePolicy, PolicyContext
class HybridPolicy(SparsePolicy):
    """
    Hybrid policy that uses different policies for prefill and decode.

    This is useful because optimal sparsity patterns often differ:
    - Prefill: fixed patterns work well (e.g., VerticalSlash)
    - Decode: query-aware selection helps (e.g., Quest)

    Example usage:
    ```python
    from nanovllm.kvcache.sparse import (
        HybridPolicy, VerticalSlashPolicy, QuestPolicy,
        VerticalSlashConfig, QuestConfig, BlockMetadataManager
    )

    # Prefill: use fast fixed pattern
    prefill_policy = VerticalSlashPolicy(VerticalSlashConfig(
        num_sink_blocks=1,
        local_window_blocks=3,
    ))

    # Decode: use query-aware selection
    metadata = BlockMetadataManager(num_blocks, num_layers, num_heads, head_dim)
    decode_policy = QuestPolicy(QuestConfig(topk_blocks=8), metadata)

    # Combine
    policy = HybridPolicy(prefill_policy, decode_policy)
    ```
    """

    def __init__(
        self,
        prefill_policy: SparsePolicy,
        decode_policy: SparsePolicy,
    ):
        """
        Initialize hybrid policy.

        Args:
            prefill_policy: Policy to use during the prefill phase
            decode_policy: Policy to use during the decode phase
        """
        self.prefill_policy = prefill_policy
        self.decode_policy = decode_policy

    def select_blocks(
        self,
        available_blocks: List[int],
        ctx: PolicyContext,
    ) -> List[int]:
        """Delegate to the appropriate policy based on phase."""
        if ctx.is_prefill:
            return self.prefill_policy.select_blocks(available_blocks, ctx)
        return self.decode_policy.select_blocks(available_blocks, ctx)

    def on_block_offloaded(
        self,
        cpu_block_id: int,
        layer_id: int,
        k_cache: torch.Tensor,
        num_valid_tokens: int,
    ) -> None:
        """Forward to both policies (both may need metadata updates)."""
        # Both sub-policies see every offload event; a policy that does not
        # track metadata inherits the no-op default, so this is always safe.
        for policy in (self.prefill_policy, self.decode_policy):
            policy.on_block_offloaded(
                cpu_block_id, layer_id, k_cache, num_valid_tokens
            )

    def reset(self) -> None:
        """Reset both policies."""
        self.prefill_policy.reset()
        self.decode_policy.reset()

    def __repr__(self) -> str:
        return (
            f"HybridPolicy(\n"
            f"  prefill={self.prefill_policy},\n"
            f"  decode={self.decode_policy}\n"
            f")"
        )

View File

@@ -134,7 +134,7 @@ class SparsePolicy(ABC):
"""
pass
def on_prefill_offload(
    self,
    cpu_block_id: int,
    layer_id: int,
    k_cache: torch.Tensor,
    num_valid_tokens: int,
) -> None:
    """
    Hook called when a block is offloaded during the prefill phase.

    Called BEFORE the GPU->CPU copy, while ``k_cache`` is still on GPU.

    Override this to collect metadata about blocks (e.g., min/max keys
    for Quest-style selection). The default implementation does nothing.

    Args:
        cpu_block_id: The CPU block ID that will be written
        layer_id: Transformer layer index
        k_cache: Key cache tensor [block_size, num_kv_heads, head_dim] (on GPU)
        num_valid_tokens: Number of valid tokens in this block
    """
    # Intentionally a no-op: subclasses opt in to metadata collection.
    pass
def on_decode_offload(
    self,
    cpu_block_id: int,
    layer_id: int,
    k_cache: torch.Tensor,
    num_valid_tokens: int,
) -> None:
    """
    Hook called when a block is offloaded during the decode phase.

    Called BEFORE the GPU->CPU copy, while ``k_cache`` is still on GPU.

    Override this to update metadata about blocks. The default
    implementation does nothing.

    Args:
        cpu_block_id: The CPU block ID that will be written
        layer_id: Transformer layer index
        k_cache: Key cache tensor [block_size, num_kv_heads, head_dim] (on GPU)
        num_valid_tokens: Number of valid tokens in this block
    """
    # Intentionally a no-op: subclasses opt in to metadata updates.
    pass

View File

@@ -289,14 +289,25 @@ class QuestPolicy(SparsePolicy):
return result
def on_prefill_offload(
    self,
    cpu_block_id: int,
    layer_id: int,
    k_cache: torch.Tensor,
    num_valid_tokens: int,
) -> None:
    """
    Update min/max key metadata during prefill offload.

    Args:
        cpu_block_id: The CPU block ID that will be written
        layer_id: Transformer layer index
        k_cache: Key cache tensor for this block (still on GPU)
        num_valid_tokens: Number of valid tokens in this block
    """
    # Silently skip when no metadata manager was configured; selection
    # then falls back to whatever the policy does without metadata.
    if self.metadata is not None:
        self.metadata.update_metadata(
            cpu_block_id, layer_id, k_cache, num_valid_tokens
        )
def on_decode_offload(
    self,
    cpu_block_id: int,
    layer_id: int,
    k_cache: torch.Tensor,
    num_valid_tokens: int,
) -> None:
    """
    Update min/max key metadata during decode offload (for new blocks).

    Args:
        cpu_block_id: The CPU block ID that will be written
        layer_id: Transformer layer index
        k_cache: Key cache tensor for this block (still on GPU)
        num_valid_tokens: Number of valid tokens in this block
    """
    # Same bookkeeping as the prefill hook: blocks filled during decode
    # also need min/max key summaries for Quest block selection.
    if self.metadata is not None:
        self.metadata.update_metadata(
            cpu_block_id, layer_id, k_cache, num_valid_tokens
        )