feat: integrate sparse policy architecture into GPU-only mode

- Add compute_prefill() and compute_decode() GPU-only methods to SparsePolicy base class
- Implement GPU-only methods in FullAttentionPolicy using flash_attn
- Add sparse_policy parameter to GPUOnlyManager
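- Update create_kvcache_manager() to always attach a sparse policy in GPU-only mode: FullAttentionPolicy by default (also when config.sparse_policy is None), or a QUEST / XATTN_BSA policy built via create_sparse_policy() when configured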
- Route GPU-only attention through sparse_policy in attention.py
- Pass kvcache_manager to context for policy access
- Add --enable-policy flag to bench.py for testing
- Handle warmup phase when kvcache_manager is not yet allocated

This allows GPU-only mode to use the same policy architecture as CPU offload mode,
enabling future sparse attention implementations (Quest, XAttention) in GPU-only mode.

Performance verified: ~4890 tok/s (unchanged from baseline)

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-27 05:08:02 +08:00
parent 05ce57ee8e
commit 09b2136e9f
7 changed files with 287 additions and 25 deletions

View File

@@ -25,7 +25,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
Factory function to create the appropriate KV cache manager.
Decision logic:
1. If enable_cpu_offload=False: use GPUOnlyManager
1. If enable_cpu_offload=False: use GPUOnlyManager with a sparse policy (FullAttentionPolicy by default)
2. If enable_cpu_offload=True but all blocks fit in GPU: use GPUOnlyManager
3. If enable_cpu_offload=True and need CPU blocks: use HybridKVCacheManager
@@ -37,9 +37,44 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
"""
if not getattr(config, 'enable_cpu_offload', False):
# Default: pure GPU mode
# Check if sparse policy is requested for GPU-only mode
from nanovllm.config import SparsePolicyType
sparse_policy_type = getattr(config, 'sparse_policy', None)
# Handle None case - use FULL as default
if sparse_policy_type is None:
sparse_policy_type = SparsePolicyType.FULL
sparse_policy = None
if sparse_policy_type != SparsePolicyType.FULL:
# Create sparse policy for GPU-only mode
from nanovllm.kvcache.sparse import create_sparse_policy
policy_kwargs = {}
if sparse_policy_type == SparsePolicyType.QUEST:
policy_kwargs = {
'topk_blocks': getattr(config, 'sparse_topk_blocks', 8),
'threshold_blocks': getattr(config, 'sparse_threshold_blocks', 4),
}
elif sparse_policy_type == SparsePolicyType.XATTN_BSA:
policy_kwargs = {
'block_size': getattr(config, 'sparse_block_size', 128),
'samples_per_chunk': getattr(config, 'sparse_samples_per_chunk', 128),
'threshold': getattr(config, 'sparse_threshold', 0.9),
'use_triton': getattr(config, 'sparse_use_triton', True),
'stride': getattr(config, 'sparse_stride', 8),
'chunk_size': getattr(config, 'sparse_chunk_size', 16384),
}
sparse_policy = create_sparse_policy(sparse_policy_type, **policy_kwargs)
else:
# FULL policy for GPU-only mode - always create for consistent API
from nanovllm.kvcache.sparse import FullAttentionPolicy
sparse_policy = FullAttentionPolicy()
return GPUOnlyManager(
num_blocks=config.num_kvcache_blocks,
block_size=config.kvcache_block_size,
sparse_policy=sparse_policy,
)
# CPU offload is enabled