✨ feat: integrate sparse policy architecture into GPU-only mode
- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
- Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode, enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -25,7 +25,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
|
||||
Factory function to create the appropriate KV cache manager.
|
||||
|
||||
Decision logic:
|
||||
1. If enable_cpu_offload=False: use GPUOnlyManager
|
||||
1. If enable_cpu_offload=False: use GPUOnlyManager (optionally with sparse policy)
|
||||
2. If enable_cpu_offload=True but all blocks fit in GPU: use GPUOnlyManager
|
||||
3. If enable_cpu_offload=True and need CPU blocks: use HybridKVCacheManager
|
||||
|
||||
@@ -37,9 +37,44 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
|
||||
"""
|
||||
if not getattr(config, 'enable_cpu_offload', False):
|
||||
# Default: pure GPU mode
|
||||
# Check if sparse policy is requested for GPU-only mode
|
||||
from nanovllm.config import SparsePolicyType
|
||||
sparse_policy_type = getattr(config, 'sparse_policy', None)
|
||||
# Handle None case - use FULL as default
|
||||
if sparse_policy_type is None:
|
||||
sparse_policy_type = SparsePolicyType.FULL
|
||||
|
||||
sparse_policy = None
|
||||
if sparse_policy_type != SparsePolicyType.FULL:
|
||||
# Create sparse policy for GPU-only mode
|
||||
from nanovllm.kvcache.sparse import create_sparse_policy
|
||||
|
||||
policy_kwargs = {}
|
||||
if sparse_policy_type == SparsePolicyType.QUEST:
|
||||
policy_kwargs = {
|
||||
'topk_blocks': getattr(config, 'sparse_topk_blocks', 8),
|
||||
'threshold_blocks': getattr(config, 'sparse_threshold_blocks', 4),
|
||||
}
|
||||
elif sparse_policy_type == SparsePolicyType.XATTN_BSA:
|
||||
policy_kwargs = {
|
||||
'block_size': getattr(config, 'sparse_block_size', 128),
|
||||
'samples_per_chunk': getattr(config, 'sparse_samples_per_chunk', 128),
|
||||
'threshold': getattr(config, 'sparse_threshold', 0.9),
|
||||
'use_triton': getattr(config, 'sparse_use_triton', True),
|
||||
'stride': getattr(config, 'sparse_stride', 8),
|
||||
'chunk_size': getattr(config, 'sparse_chunk_size', 16384),
|
||||
}
|
||||
|
||||
sparse_policy = create_sparse_policy(sparse_policy_type, **policy_kwargs)
|
||||
else:
|
||||
# FULL policy for GPU-only mode - always create for consistent API
|
||||
from nanovllm.kvcache.sparse import FullAttentionPolicy
|
||||
sparse_policy = FullAttentionPolicy()
|
||||
|
||||
return GPUOnlyManager(
|
||||
num_blocks=config.num_kvcache_blocks,
|
||||
block_size=config.kvcache_block_size,
|
||||
sparse_policy=sparse_policy,
|
||||
)
|
||||
|
||||
# CPU offload is enabled
|
||||
|
||||
Reference in New Issue
Block a user