✨ feat: integrate sparse policy architecture into GPU-only mode

- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class - Implement GPU-only methods in FullAttentionPolicy using flash_attn - Add sparse_policy parameter to GPUOnlyManager - Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode - Route GPU-only attention through sparse_policy in attention.py - Pass kvcache_manager to context for policy access - Add --enable-policy flag to bench.py for testing - Handle warmup phase when kvcache_manager is not yet allocated This allows GPU-only mode to use the same policy architecture as CPU offload mode, enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode. Performance verified: ~4890 tok/s (unchanged from baseline) Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: Happy <yesreply@happy.engineering>
2026-01-27 05:08:02 +08:00
parent 05ce57ee8e
commit 09b2136e9f
7 changed files with 287 additions and 25 deletions
--- a/nanovllm/kvcache/init.py
+++ b/nanovllm/kvcache/init.py
@@ -25,7 +25,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
    Factory function to create the appropriate KV cache manager.

    Decision logic:
-    1. If enable_cpu_offload=False: use GPUOnlyManager
+    1. If enable_cpu_offload=False: use GPUOnlyManager (optionally with sparse policy)
    2. If enable_cpu_offload=True but all blocks fit in GPU: use GPUOnlyManager
    3. If enable_cpu_offload=True and need CPU blocks: use HybridKVCacheManager

@@ -37,9 +37,44 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
    """
    if not getattr(config, 'enable_cpu_offload', False):
        # Default: pure GPU mode
+        # Check if sparse policy is requested for GPU-only mode
+        from nanovllm.config import SparsePolicyType
+        sparse_policy_type = getattr(config, 'sparse_policy', None)
+        # Handle None case - use FULL as default
+        if sparse_policy_type is None:
+            sparse_policy_type = SparsePolicyType.FULL
+
+        sparse_policy = None
+        if sparse_policy_type != SparsePolicyType.FULL:
+            # Create sparse policy for GPU-only mode
+            from nanovllm.kvcache.sparse import create_sparse_policy
+
+            policy_kwargs = {}
+            if sparse_policy_type == SparsePolicyType.QUEST:
+                policy_kwargs = {
+                    'topk_blocks': getattr(config, 'sparse_topk_blocks', 8),
+                    'threshold_blocks': getattr(config, 'sparse_threshold_blocks', 4),
+                }
+            elif sparse_policy_type == SparsePolicyType.XATTN_BSA:
+                policy_kwargs = {
+                    'block_size': getattr(config, 'sparse_block_size', 128),
+                    'samples_per_chunk': getattr(config, 'sparse_samples_per_chunk', 128),
+                    'threshold': getattr(config, 'sparse_threshold', 0.9),
+                    'use_triton': getattr(config, 'sparse_use_triton', True),
+                    'stride': getattr(config, 'sparse_stride', 8),
+                    'chunk_size': getattr(config, 'sparse_chunk_size', 16384),
+                }
+
+            sparse_policy = create_sparse_policy(sparse_policy_type, **policy_kwargs)
+        else:
+            # FULL policy for GPU-only mode - always create for consistent API
+            from nanovllm.kvcache.sparse import FullAttentionPolicy
+            sparse_policy = FullAttentionPolicy()
+
        return GPUOnlyManager(
            num_blocks=config.num_kvcache_blocks,
            block_size=config.kvcache_block_size,
+            sparse_policy=sparse_policy,
        )

    # CPU offload is enabled
--- a/nanovllm/kvcache/gpu_manager.py
+++ b/nanovllm/kvcache/gpu_manager.py
@@ -7,13 +7,16 @@ the KVCacheManager interface.
 """

 from collections import deque
-from typing import List, Tuple, Dict, Optional
+from typing import List, Tuple, Dict, Optional, TYPE_CHECKING
 import torch
 from torch import Tensor

 from nanovllm.engine.sequence import Sequence
 from nanovllm.kvcache.base_manager import KVCacheManager

+if TYPE_CHECKING:
+    from nanovllm.kvcache.sparse.policy import SparsePolicy
+

 class Block:
    """Physical block in GPU memory."""
@@ -50,17 +53,28 @@ class GPUOnlyManager(KVCacheManager):
    all data stays on GPU at fixed addresses.
    """

-    def __init__(self, num_blocks: int, block_size: int):
+    def __init__(
+        self,
+        num_blocks: int,
+        block_size: int,
+        sparse_policy: Optional["SparsePolicy"] = None,
+    ):
        """
        Initialize GPU-only manager.

        Args:
            num_blocks: Total number of blocks to manage
            block_size: Tokens per block (default 256)
+            sparse_policy: Optional sparse attention policy for GPU-only mode
        """
        self._block_size = block_size
        self._num_blocks = num_blocks

+        # Sparse policy for GPU-only mode (optional)
+        self.sparse_policy = sparse_policy
+        # No offload engine in GPU-only mode
+        self.offload_engine = None
+
        # Block metadata
        self.blocks: List[Block] = [Block(i) for i in range(num_blocks)]

--- a/nanovllm/kvcache/sparse/full_policy.py
+++ b/nanovllm/kvcache/sparse/full_policy.py
@@ -76,6 +76,75 @@ class FullAttentionPolicy(SparsePolicy):
        logger.info(f"[Full Policy] Density Stats: chunks={stats['num_chunks']}, "
                   f"blocks={stats['total_available_blocks']}, density=100.0%")

+    # =========================================================================
+    # GPU-only methods (non-chunked)
+    # =========================================================================
+
+    def compute_prefill(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_k: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        softmax_scale: float,
+        layer_id: int,
+        block_tables: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        GPU-only prefill attention using flash_attn_varlen_func.
+
+        This is the simplest implementation - just call flash attention directly.
+        For sparse policies, this method would implement block selection.
+        """
+        from flash_attn import flash_attn_varlen_func
+
+        return flash_attn_varlen_func(
+            q, k, v,
+            cu_seqlens_q=cu_seqlens_q,
+            cu_seqlens_k=cu_seqlens_k,
+            max_seqlen_q=max_seqlen_q,
+            max_seqlen_k=max_seqlen_k,
+            softmax_scale=softmax_scale,
+            causal=True,
+            block_table=block_tables,
+        )
+
+    def compute_decode(
+        self,
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        layer_id: int,
+        block_tables: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        GPU-only decode attention using flash_attn_with_kvcache.
+
+        This is the simplest implementation - just call flash attention directly.
+        For sparse policies, this method would implement block selection.
+        """
+        from flash_attn import flash_attn_with_kvcache
+
+        # q is [batch, num_heads, head_dim], need to add seq dim
+        return flash_attn_with_kvcache(
+            q.unsqueeze(1),  # [batch, 1, heads, dim]
+            k_cache,
+            v_cache,
+            cache_seqlens=cache_seqlens,
+            block_table=block_tables,
+            softmax_scale=softmax_scale,
+            causal=True,
+        )
+
+    # =========================================================================
+    # Chunked offload methods
+    # =========================================================================
+
    def compute_chunked_prefill(
        self,
        q: torch.Tensor,
--- a/nanovllm/kvcache/sparse/policy.py
+++ b/nanovllm/kvcache/sparse/policy.py
@@ -191,6 +191,87 @@ class SparsePolicy(ABC):
        """
        pass

+    # =========================================================================
+    # GPU-only methods (non-chunked)
+    # These methods are used when all KV cache is on GPU, no CPU offload needed.
+    # =========================================================================
+
+    def compute_prefill(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        cu_seqlens_q: torch.Tensor,
+        cu_seqlens_k: torch.Tensor,
+        max_seqlen_q: int,
+        max_seqlen_k: int,
+        softmax_scale: float,
+        layer_id: int,
+        block_tables: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Compute GPU-only prefill attention (non-chunked).
+
+        This method is used when all KV cache resides on GPU (no CPU offload).
+        Override this to implement sparse prefill attention for GPU-only mode.
+        Default implementation raises NotImplementedError.
+
+        Args:
+            q: [total_q, num_heads, head_dim] query tensor (packed variable length)
+            k: [total_kv, num_kv_heads, head_dim] key tensor
+            v: [total_kv, num_kv_heads, head_dim] value tensor
+            cu_seqlens_q: [batch+1] cumulative sequence lengths for queries
+            cu_seqlens_k: [batch+1] cumulative sequence lengths for keys
+            max_seqlen_q: maximum query sequence length
+            max_seqlen_k: maximum key sequence length
+            softmax_scale: softmax scaling factor (1/sqrt(head_dim))
+            layer_id: transformer layer index
+            block_tables: [batch, max_blocks] paged attention block tables (optional)
+
+        Returns:
+            [total_q, num_heads, head_dim] attention output
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement compute_prefill for GPU-only mode"
+        )
+
+    def compute_decode(
+        self,
+        q: torch.Tensor,
+        k_cache: torch.Tensor,
+        v_cache: torch.Tensor,
+        cache_seqlens: torch.Tensor,
+        softmax_scale: float,
+        layer_id: int,
+        block_tables: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Compute GPU-only decode attention (non-chunked).
+
+        This method is used when all KV cache resides on GPU (no CPU offload).
+        Override this to implement sparse decode attention for GPU-only mode.
+        Default implementation raises NotImplementedError.
+
+        Args:
+            q: [batch, num_heads, head_dim] query tensor (single token per sequence)
+            k_cache: [num_blocks, block_size, num_kv_heads, head_dim] paged key cache
+            v_cache: [num_blocks, block_size, num_kv_heads, head_dim] paged value cache
+            cache_seqlens: [batch] sequence lengths in cache
+            softmax_scale: softmax scaling factor (1/sqrt(head_dim))
+            layer_id: transformer layer index
+            block_tables: [batch, max_blocks] paged attention block tables (optional)
+
+        Returns:
+            [batch, 1, num_heads, head_dim] attention output
+        """
+        raise NotImplementedError(
+            f"{self.__class__.__name__} does not implement compute_decode for GPU-only mode"
+        )
+
+    # =========================================================================
+    # Chunked offload methods (for CPU offload mode)
+    # =========================================================================
+
    @abstractmethod
    def compute_chunked_prefill(
        self,