97 lines
3.1 KiB
Python
97 lines
3.1 KiB
Python
"""
KV Cache management module.

This module provides pluggable KV cache management strategies:
- GPUOnlyManager: Pure GPU (default, current nano-vllm behavior)
- HybridKVCacheManager: CPU-primary storage with GPU ring buffer for computation

Usage:
    from nanovllm.kvcache import create_kvcache_manager

    manager = create_kvcache_manager(config)
"""
|
|
|
|
from typing import TYPE_CHECKING
|
|
|
|
from nanovllm.kvcache.base_manager import KVCacheManager
|
|
from nanovllm.kvcache.gpu_manager import GPUOnlyManager
|
|
|
|
if TYPE_CHECKING:
|
|
from nanovllm.config import Config
|
|
|
|
|
|
def create_kvcache_manager(config: "Config") -> KVCacheManager:
    """
    Build the KV cache manager that matches the offload settings on ``config``.

    Selection rules:
        * ``enable_cpu_offload`` is missing or falsy: return a
          ``GPUOnlyManager`` sized by ``config.num_kvcache_blocks``.
        * Offload is enabled but ``config.num_cpu_kvcache_blocks <= 0``:
          return a ``GPUOnlyManager`` sized by
          ``config.num_gpu_kvcache_blocks``.
        * Otherwise: return a ``HybridKVCacheManager`` configured with an
          eviction policy plus separate prefill and decode sparse policies.

    Optional config attributes and the fallback used when they are absent:
        ``offload_policy`` ('lru'), ``prefill_policy`` ('full'),
        ``decode_policy`` ('full'), ``sparse_topk_blocks`` (8),
        ``sparse_threshold_blocks`` (4), ``sparse_num_sink_blocks`` (1).

    Args:
        config: Model configuration carrying the block counts, block size
            and offload settings read above.

    Returns:
        A ``GPUOnlyManager`` or a ``HybridKVCacheManager``.
    """
    offload_requested = getattr(config, 'enable_cpu_offload', False)
    if not offload_requested:
        # Offload disabled: everything stays on the GPU.
        return GPUOnlyManager(
            num_blocks=config.num_kvcache_blocks,
            block_size=config.kvcache_block_size,
        )

    gpu_block_count = config.num_gpu_kvcache_blocks
    cpu_block_count = config.num_cpu_kvcache_blocks

    if cpu_block_count <= 0:
        # Offload was requested, but no CPU blocks are configured, so the
        # pure-GPU manager is used with the GPU block budget.
        return GPUOnlyManager(
            num_blocks=gpu_block_count,
            block_size=config.kvcache_block_size,
        )

    # The hybrid-only modules are imported here so the pure-GPU paths above
    # never load them.
    from nanovllm.kvcache.hybrid_manager import HybridKVCacheManager
    from nanovllm.kvcache.policies import get_policy
    from nanovllm.kvcache.sparse import create_sparse_policy, SparsePolicyType

    eviction = get_policy(getattr(config, 'offload_policy', 'lru'))

    prefill_name = getattr(config, 'prefill_policy', 'full')
    decode_name = getattr(config, 'decode_policy', 'full')

    def _sparse_policy_for(name):
        """Turn a policy name from the config into a sparse policy object."""
        if name.lower() != 'full':
            # Names are matched case-insensitively against member names.
            # NOTE(review): presumably SparsePolicyType is an Enum, so an
            # unknown name fails on this lookup - confirm.
            kind = SparsePolicyType[name.upper()]
            return create_sparse_policy(
                kind,
                topk_blocks=getattr(config, 'sparse_topk_blocks', 8),
                threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
                # NOTE(review): a count (sparse_num_sink_blocks) is passed
                # as include_sink_blocks - confirm the callee expects a
                # count rather than a flag.
                include_sink_blocks=getattr(config, 'sparse_num_sink_blocks', 1),
            )
        # 'full' takes no tuning parameters.
        return create_sparse_policy(SparsePolicyType.FULL)

    prefill = _sparse_policy_for(prefill_name)
    decode = _sparse_policy_for(decode_name)

    return HybridKVCacheManager(
        num_gpu_slots=gpu_block_count,
        num_cpu_blocks=cpu_block_count,
        block_size=config.kvcache_block_size,
        policy=eviction,
        prefill_policy=prefill,
        decode_policy=decode,
    )
|
|
|
|
|
|
# Names exported by ``from nanovllm.kvcache import *``.
__all__ = ["KVCacheManager", "GPUOnlyManager", "create_kvcache_manager"]
|