[feat] Add sparse KV cache feature (needs verification).

2025-12-22 08:51:02 +08:00
parent 8df0c7517b
commit 051f2295c9
14 changed files with 1215 additions and 12 deletions
--- a/nanovllm/kvcache/sparse/full_policy.py
+++ b/nanovllm/kvcache/sparse/full_policy.py
@@ -0,0 +1,34 @@
+"""
+Full attention policy - loads all blocks (no sparsity).
+
+This serves as a baseline and default policy when sparse
+attention is not needed.
+"""
+
+from typing import List
+from .policy import SparsePolicy, PolicyContext
+
+
+class FullAttentionPolicy(SparsePolicy):
+    """
+    Full attention policy that loads all available blocks.
+
+    This is the default behavior with no sparsity - all previous
+    KV cache blocks are loaded for each query chunk.
+
+    Use this as:
+    - A baseline for comparing sparse policies
+    - When you need full attention accuracy
+    - For short sequences where sparsity isn't beneficial
+    """
+
+    def select_blocks(
+        self,
+        available_blocks: List[int],
+        ctx: PolicyContext,
+    ) -> List[int]:
+        """Return all blocks - no sparsity."""
+        return available_blocks
+
+    def __repr__(self) -> str:
+        return "FullAttentionPolicy()"