feat: integrate sparse policy architecture into GPU-only mode

- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
- Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode,
enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-27 05:08:02 +08:00
parent 05ce57ee8e
commit 09b2136e9f
7 changed files with 287 additions and 25 deletions

View File

@@ -7,13 +7,16 @@ the KVCacheManager interface.
"""
from collections import deque
from typing import List, Tuple, Dict, Optional
from typing import List, Tuple, Dict, Optional, TYPE_CHECKING
import torch
from torch import Tensor
from nanovllm.engine.sequence import Sequence
from nanovllm.kvcache.base_manager import KVCacheManager
if TYPE_CHECKING:
from nanovllm.kvcache.sparse.policy import SparsePolicy
class Block:
"""Physical block in GPU memory."""
@@ -50,17 +53,28 @@ class GPUOnlyManager(KVCacheManager):
all data stays on GPU at fixed addresses.
"""
def __init__(self, num_blocks: int, block_size: int):
def __init__(
self,
num_blocks: int,
block_size: int,
sparse_policy: Optional["SparsePolicy"] = None,
):
"""
Initialize GPU-only manager.
Args:
num_blocks: Total number of blocks to manage
block_size: Tokens per block (e.g. 256; required, no default is defined here)
sparse_policy: Optional sparse attention policy for GPU-only mode
"""
self._block_size = block_size
self._num_blocks = num_blocks
# Sparse policy for GPU-only mode (optional)
self.sparse_policy = sparse_policy
# No offload engine in GPU-only mode
self.offload_engine = None
# Block metadata
self.blocks: List[Block] = [Block(i) for i in range(num_blocks)]