✨ feat: integrate sparse policy architecture into GPU-only mode
- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
- Update create_kvcache_manager() to create FullAttentionPolicy for GPU-only mode
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode, enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -7,13 +7,16 @@ the KVCacheManager interface.
|
||||
"""
|
||||
|
||||
from collections import deque
|
||||
from typing import List, Tuple, Dict, Optional
|
||||
from typing import List, Tuple, Dict, Optional, TYPE_CHECKING
|
||||
import torch
|
||||
from torch import Tensor
|
||||
|
||||
from nanovllm.engine.sequence import Sequence
|
||||
from nanovllm.kvcache.base_manager import KVCacheManager
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from nanovllm.kvcache.sparse.policy import SparsePolicy
|
||||
|
||||
|
||||
class Block:
|
||||
"""Physical block in GPU memory."""
|
||||
@@ -50,17 +53,28 @@ class GPUOnlyManager(KVCacheManager):
|
||||
all data stays on GPU at fixed addresses.
|
||||
"""
|
||||
|
||||
def __init__(self, num_blocks: int, block_size: int):
|
||||
def __init__(
|
||||
self,
|
||||
num_blocks: int,
|
||||
block_size: int,
|
||||
sparse_policy: Optional["SparsePolicy"] = None,
|
||||
):
|
||||
"""
|
||||
Initialize GPU-only manager.
|
||||
|
||||
Args:
|
||||
num_blocks: Total number of blocks to manage
|
||||
block_size: Tokens per block (default 256)
|
||||
sparse_policy: Optional sparse attention policy for GPU-only mode
|
||||
"""
|
||||
self._block_size = block_size
|
||||
self._num_blocks = num_blocks
|
||||
|
||||
# Sparse policy for GPU-only mode (optional)
|
||||
self.sparse_policy = sparse_policy
|
||||
# No offload engine in GPU-only mode
|
||||
self.offload_engine = None
|
||||
|
||||
# Block metadata
|
||||
self.blocks: List[Block] = [Block(i) for i in range(num_blocks)]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user