✨ feat: integrate sparse policy architecture into GPU-only mode

- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
- Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode, enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
2026-01-27 05:08:02 +08:00
parent 05ce57ee8e
commit 09b2136e9f
7 changed files with 287 additions and 25 deletions
--- a/nanovllm/kvcache/gpu_manager.py
+++ b/nanovllm/kvcache/gpu_manager.py
@@ -7,13 +7,16 @@ the KVCacheManager interface.
 """

 from collections import deque
-from typing import List, Tuple, Dict, Optional
+from typing import List, Tuple, Dict, Optional, TYPE_CHECKING
 import torch
 from torch import Tensor

 from nanovllm.engine.sequence import Sequence
 from nanovllm.kvcache.base_manager import KVCacheManager

+if TYPE_CHECKING:
+    from nanovllm.kvcache.sparse.policy import SparsePolicy
+

 class Block:
    """Physical block in GPU memory."""
@@ -50,17 +53,28 @@ class GPUOnlyManager(KVCacheManager):
    all data stays on GPU at fixed addresses.
    """

-    def __init__(self, num_blocks: int, block_size: int):
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+        sparse_policy: Optional["SparsePolicy"] = None,
+    ):
        """
        Initialize GPU-only manager.

        Args:
            num_blocks: Total number of blocks to manage
            block_size: Tokens per block (default 256)
+            sparse_policy: Optional sparse attention policy for GPU-only mode
        """
        self._block_size = block_size
        self._num_blocks = num_blocks

+        # Sparse policy for GPU-only mode (optional)
+        self.sparse_policy = sparse_policy
+        # No offload engine in GPU-only mode
+        self.offload_engine = None
+
        # Block metadata
        self.blocks: List[Block] = [Block(i) for i in range(num_blocks)]