[feat] Add sparse KV cache feature (needs verification).

2025-12-22 08:51:02 +08:00
parent 8df0c7517b
commit 051f2295c9
14 changed files with 1215 additions and 12 deletions
--- a/nanovllm/kvcache/hybrid_manager.py
+++ b/nanovllm/kvcache/hybrid_manager.py
@@ -25,6 +25,11 @@ from nanovllm.kvcache.offload_engine import OffloadEngine
 from nanovllm.kvcache.policies.base_policy import EvictionPolicy
 from nanovllm.kvcache.policies.lru_policy import LRUPolicy

+# Type checking import for sparse policy
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from nanovllm.kvcache.sparse.policy import SparsePolicy
+

 class BlockLocation(Enum):
    """Where a logical block's data currently resides."""
@@ -142,6 +147,9 @@ class HybridKVCacheManager(KVCacheManager):
        # Key: sequence id, Value: starting position where decode began in current block
        self._decode_start_pos: Dict[int, int] = {}

+        # Sparse attention policy (optional)
+        self.sparse_policy: Optional["SparsePolicy"] = None
+
    @property
    def block_size(self) -> int:
        return self._block_size
@@ -174,6 +182,24 @@ class HybridKVCacheManager(KVCacheManager):
        assert self.offload_engine is not None
        return self.offload_engine.get_layer_cache(layer_id)

+    def set_sparse_policy(self, policy: "SparsePolicy") -> None:
+        """
+        Install the sparse attention policy used for block selection.
+
+        Stores ``policy`` on ``self.sparse_policy`` and logs it at INFO level. The
+        policy picks which KV blocks to load from CPU per query chunk in chunked attention.
+
+        Args:
+            policy: SparsePolicy instance (e.g., VerticalSlashPolicy, QuestPolicy)
+
+        Example:
+            from nanovllm.kvcache.sparse import VerticalSlashPolicy, VerticalSlashConfig
+            policy = VerticalSlashPolicy(VerticalSlashConfig(num_sink_blocks=2))
+            manager.set_sparse_policy(policy)
+        """
+        self.sparse_policy = policy
+        logger.info(f"Sparse attention policy set: {policy}")
+
    def _allocate_gpu_slot(self, protected_logical_ids: Optional[Set[int]] = None) -> int:
        """
        Get a free GPU slot, evicting if necessary.