96 lines
3.2 KiB
Python
96 lines
3.2 KiB
Python
"""
KV Cache management module.

This module provides pluggable KV cache management strategies:
- GPUOnlyManager: Pure GPU (default, current nano-vllm behavior)
- HybridKVCacheManager: CPU-primary storage with GPU ring buffer for computation

Usage:
    from nanovllm.kvcache import create_kvcache_manager

    manager = create_kvcache_manager(config)
"""
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from nanovllm.kvcache.base_manager import KVCacheManager
|
|
from nanovllm.kvcache.gpu_manager import GPUOnlyManager
|
|
|
|
if TYPE_CHECKING:
|
|
from nanovllm.config import Config
|
|
|
|
|
|
def _build_hybrid_manager(config, gpu_slot_count, cpu_block_count):
    """Assemble a ``HybridKVCacheManager`` from the offload settings on ``config``."""
    # Imported lazily so the hybrid stack is only loaded when it is needed.
    from nanovllm.kvcache.hybrid_manager import HybridKVCacheManager
    from nanovllm.kvcache.policies import get_policy
    from nanovllm.kvcache.sparse import create_sparse_policy
    from nanovllm.config import SparsePolicyType

    policy_name = getattr(config, "offload_policy", "lru")
    eviction = get_policy(policy_name)

    # The sparse policy is selected by the config enum, defaulting to FULL.
    # Quest is decode-only: prefill returns all blocks (query=None) and
    # decode does Top-K.
    sparse_kind = getattr(config, "sparse_policy", SparsePolicyType.FULL)
    topk = getattr(config, "sparse_topk_blocks", 8)
    threshold = getattr(config, "sparse_threshold_blocks", 4)
    sparse = create_sparse_policy(
        sparse_kind,
        topk_blocks=topk,
        threshold_blocks=threshold,
    )

    # A prefill may already use about max_model_len tokens, so the sequence
    # capacity is widened by max_new_tokens (512 when the config does not
    # define it) to leave slots for the decode phase.
    decode_headroom = getattr(config, "max_new_tokens", 512)
    seq_capacity = config.max_model_len + decode_headroom

    return HybridKVCacheManager(
        num_gpu_slots=gpu_slot_count,
        num_cpu_blocks=cpu_block_count,
        block_size=config.kvcache_block_size,
        policy=eviction,
        sparse_policy=sparse,
        num_kv_buffers=getattr(config, "num_kv_buffers", 4),
        max_seq_len=seq_capacity,
    )


def create_kvcache_manager(config: "Config") -> KVCacheManager:
    """
    Build the KV cache manager selected by the offload settings on ``config``.

    Selection rules, checked in order:
        1. ``enable_cpu_offload`` absent or falsy: ``GPUOnlyManager`` sized by
           ``config.num_kvcache_blocks``, with ``max_seq_len`` set to
           ``config.max_model_len``.
        2. Offload enabled but ``config.num_cpu_kvcache_blocks <= 0``:
           ``GPUOnlyManager`` sized by ``config.num_gpu_kvcache_blocks``
           (no ``max_seq_len`` is passed on this path).
        3. Otherwise: ``HybridKVCacheManager`` built from the GPU slot count
           and the CPU block count.

    Optional settings (``offload_policy``, ``sparse_policy``,
    ``sparse_topk_blocks``, ``sparse_threshold_blocks``, ``max_new_tokens``,
    ``num_kv_buffers``) are read with ``getattr`` and fall back to
    ``'lru'``, ``SparsePolicyType.FULL``, 8, 4, 512 and 4 respectively.

    Args:
        config: Model configuration carrying the block counts, block size,
            ``max_model_len`` and the offload settings listed above.

    Returns:
        A ``GPUOnlyManager`` or ``HybridKVCacheManager`` instance.
    """
    offload_requested = getattr(config, "enable_cpu_offload", False)
    if not offload_requested:
        # Pure GPU mode (the default). max_seq_len is supplied here to enable
        # the contiguous cache used for the single-sequence optimization.
        return GPUOnlyManager(
            num_blocks=config.num_kvcache_blocks,
            block_size=config.kvcache_block_size,
            max_seq_len=config.max_model_len,
        )

    # Offload was requested: check whether any CPU blocks are needed.
    gpu_slot_count = config.num_gpu_kvcache_blocks
    cpu_block_count = config.num_cpu_kvcache_blocks

    if cpu_block_count <= 0:
        # Everything fits on the GPU, so the hybrid machinery is unnecessary.
        return GPUOnlyManager(
            num_blocks=gpu_slot_count,
            block_size=config.kvcache_block_size,
        )

    return _build_hybrid_manager(config, gpu_slot_count, cpu_block_count)
|
|
|
|
|
|
# Public API of this module: the manager base class, the GPU-only manager,
# and the factory. HybridKVCacheManager is imported lazily inside the factory
# and is therefore not listed here.
__all__ = ["KVCacheManager", "GPUOnlyManager", "create_kvcache_manager"]
|