[feat] Added sparse KV cache feature; needs verification.

This commit is contained in:
Zijie Tian
2025-12-22 08:51:02 +08:00
parent 8df0c7517b
commit 051f2295c9
14 changed files with 1215 additions and 12 deletions

View File

@@ -25,6 +25,11 @@ from nanovllm.kvcache.offload_engine import OffloadEngine
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
from nanovllm.kvcache.policies.lru_policy import LRUPolicy
# Type checking import for sparse policy
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from nanovllm.kvcache.sparse.policy import SparsePolicy
class BlockLocation(Enum):
"""Where a logical block's data currently resides."""
@@ -142,6 +147,9 @@ class HybridKVCacheManager(KVCacheManager):
# Key: sequence id, Value: starting position where decode began in current block
self._decode_start_pos: Dict[int, int] = {}
# Sparse attention policy (optional)
self.sparse_policy: Optional["SparsePolicy"] = None
@property
def block_size(self) -> int:
"""Return the block size held in ``self._block_size`` (read-only accessor)."""
return self._block_size
@@ -174,6 +182,24 @@ class HybridKVCacheManager(KVCacheManager):
assert self.offload_engine is not None
return self.offload_engine.get_layer_cache(layer_id)
def set_sparse_policy(self, policy: "SparsePolicy") -> None:
    """
    Register the sparse attention policy used for block selection.

    The policy is what decides which KV blocks get loaded from CPU for
    each query chunk during chunked attention computation. This method
    only stores the policy on the manager and logs the change.

    Args:
        policy: SparsePolicy instance (e.g., VerticalSlashPolicy, QuestPolicy)

    Example:
        from nanovllm.kvcache.sparse import VerticalSlashPolicy, VerticalSlashConfig
        policy = VerticalSlashPolicy(VerticalSlashConfig(num_sink_blocks=2))
        manager.set_sparse_policy(policy)
    """
    # Store first, then format: same order of effects as before.
    self.sparse_policy = policy
    announcement = f"Sparse attention policy set: {policy}"
    logger.info(announcement)
def _allocate_gpu_slot(self, protected_logical_ids: Optional[Set[int]] = None) -> int:
"""
Get a free GPU slot, evicting if necessary.