feat: add XAttention sparse policy integration

Integrate the COMPASS XAttention algorithm into nano-vllm's CPU offload
execution path. Uses FlashAttention with native GQA support for offload mode.

New files:
- nanovllm/kvcache/sparse/utils.py: find_blocks_chunked() utility
- nanovllm/kvcache/sparse/kernels.py: Triton kernels for XAttention
- nanovllm/kvcache/sparse/xattn.py: XAttentionPolicy implementation

Modified:
- nanovllm/config.py: Add XATTN configuration parameters
- nanovllm/engine/model_runner.py: Support XATTN policy
- nanovllm/kvcache/sparse/__init__.py: Register XAttentionPolicy
- tests/test_ruler.py: Add --sparse-policy parameter

Test results (32k ruler):
- NIAH tasks: 12/12 (100%)
- QA/Recall tasks: 11/15 (73%)
- Overall: 23/27 (85%)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-01-14 10:04:46 +08:00
parent 029894118d
commit ac1ccbceaa
10 changed files with 1001 additions and 813 deletions
--- a/nanovllm/kvcache/sparse/__init__.py
+++ b/nanovllm/kvcache/sparse/__init__.py
@@ -24,6 +24,7 @@ from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
 from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
 from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager
 from nanovllm.kvcache.sparse.minference import MInferencePolicy
+from nanovllm.kvcache.sparse.xattn import XAttentionPolicy


 def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
@@ -65,6 +66,17 @@ def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolic
            num_recent_diags=kwargs.get("num_recent_diags", 100),
        )

+    elif policy_type == SparsePolicyType.XATTN:
+        return XAttentionPolicy(
+            stride=kwargs.get("stride", 8),
+            threshold=kwargs.get("threshold", 0.9),
+            chunk_size=kwargs.get("chunk_size", 16384),
+            use_triton=kwargs.get("use_triton", True),
+            keep_sink=kwargs.get("keep_sink", False),
+            keep_recent=kwargs.get("keep_recent", False),
+            norm=kwargs.get("norm", 1.0),
+        )
+
    else:
        raise ValueError(f"Unknown policy type: {policy_type}")

@@ -78,5 +90,6 @@ __all__ = [
    "QuestConfig",
    "BlockMetadataManager",
    "MInferencePolicy",
+    "XAttentionPolicy",
    "create_sparse_policy",
 ]