[WIP] remove num_prefetch_blocks variable.
@@ -22,7 +22,6 @@ class Config:
     offload_policy: str = "lru"  # "lru", "fifo", or full class path
     num_transfer_streams: int = 4  # Number of CUDA streams for async transfers
     num_gpu_blocks: int = -1  # User-specified GPU blocks count, -1 = auto (use max available)
-    num_prefetch_blocks: int = 2  # Number of prefetch blocks for three-region GPU buffer design

     # Computed fields for offload (set in __post_init__ or by ModelRunner)
     num_gpu_kvcache_blocks: int = -1
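
Note: since the comment on the computed fields mentions __post_init__, Config is presumably a dataclass, so the removed field simply disappears from the generated constructor. A minimal sketch of the effect (the keyword-argument construction style is an assumption, not taken from the repo):

cfg = Config(num_gpu_blocks=8, num_transfer_streams=4)  # still valid after this change
# cfg = Config(num_prefetch_blocks=2)  # would now raise TypeError: unexpected keyword argument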
@@ -58,14 +58,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
     from nanovllm.kvcache.policies import get_policy

     policy = get_policy(getattr(config, 'offload_policy', 'lru'))
-    num_prefetch_blocks = getattr(config, 'num_prefetch_blocks', 2)

     return HybridKVCacheManager(
         num_gpu_slots=num_gpu_blocks,
         num_cpu_blocks=num_cpu_blocks,
         block_size=config.kvcache_block_size,
         policy=policy,
-        num_prefetch_blocks=num_prefetch_blocks,
     )
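
Note: callers of create_kvcache_manager are unaffected, because the old value was only read through getattr with a default; the remaining getattr fallback for offload_policy works the same way. A small illustration of that pattern (the _BareConfig stand-in and its field are hypothetical, not nanovllm classes):

class _BareConfig:  # hypothetical config object without an offload_policy attribute
    kvcache_block_size = 16

policy_name = getattr(_BareConfig(), "offload_policy", "lru")
assert policy_name == "lru"  # missing attribute silently falls back to the LRU policy name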
@@ -86,7 +86,6 @@ class HybridKVCacheManager(KVCacheManager):
         num_cpu_blocks: int,
         block_size: int,
         policy: Optional[EvictionPolicy] = None,
-        num_prefetch_blocks: int = 2,
     ):
         """
         Initialize hybrid manager with CPU-primary ring buffer design.
@@ -99,13 +98,11 @@ class HybridKVCacheManager(KVCacheManager):
             num_cpu_blocks: Number of CPU pool blocks (primary storage)
             block_size: Tokens per block
             policy: Eviction policy (default: LRU, used for prefix cache management)
-            num_prefetch_blocks: Number of blocks for ring buffer pipeline (deprecated, ring_slots = num_gpu_slots)
         """
         self._block_size = block_size
         self.num_gpu_slots = num_gpu_slots
         self.num_cpu_blocks = num_cpu_blocks
         self.total_blocks = num_gpu_slots + num_cpu_blocks
-        self.num_prefetch_blocks = num_prefetch_blocks  # Ring buffer design parameter (deprecated)

         # Eviction policy
         self.policy = policy or LRUPolicy()
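
Note: a rough usage sketch of the trimmed constructor, with illustrative numbers only (the block counts are made up, and LRUPolicy is assumed to be importable from the policies module referenced earlier):

mgr = HybridKVCacheManager(num_gpu_slots=8, num_cpu_blocks=128, block_size=16)
assert mgr.total_blocks == 8 + 128  # 136 logical blocks tracked in total
assert isinstance(mgr.policy, LRUPolicy)  # default eviction policy when none is passed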
@@ -170,7 +167,6 @@ class HybridKVCacheManager(KVCacheManager):
             num_kv_heads=num_kv_heads,
             head_dim=head_dim,
             dtype=dtype,
-            num_prefetch_blocks=self.num_prefetch_blocks,
         )

     def get_layer_cache(self, layer_id: int) -> Tuple[Tensor, Tensor]:
@@ -53,7 +53,6 @@ class OffloadEngine:
         head_dim: int,
         dtype: torch.dtype = torch.float16,
         num_streams: int = 4,
-        num_prefetch_blocks: int = 2,
     ):
         self.num_layers = num_layers
         self.num_gpu_blocks = num_gpu_blocks
@@ -82,8 +81,6 @@ class OffloadEngine:
         self.decode_load_slots = list(range(1, num_gpu_blocks))
         self.num_decode_load_slots = len(self.decode_load_slots)

-        # Keep num_prefetch_blocks for compatibility (used as chunk size for loading)
-        self.num_prefetch_blocks = num_prefetch_blocks
         self.num_gpu_slots = num_gpu_blocks  # alias

         logger.info(f"Unified Ring Buffer: {self.num_ring_slots} slots total")
@@ -378,9 +378,9 @@ class Attention(nn.Module):

         offload_engine = kvcache_manager.offload_engine

-        # Use prefetch_size as chunk size for double buffering
-        # This ensures both Compute and Prefetch regions can hold a full chunk
-        chunk_size = offload_engine.num_prefetch_blocks
+        # Chunk size = capacity of each double buffer region (compute/prefetch)
+        # Each region uses half of decode_load_slots
+        chunk_size = max(1, len(offload_engine.decode_load_slots) // 2)
         num_chunks = (len(cpu_block_table) + chunk_size - 1) // chunk_size

         o_acc = None
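
Note: a worked example of the new chunking arithmetic with illustrative sizes (7 decode-load slots and a 10-block CPU block table are made-up numbers): each double-buffer region holds half of the decode-load slots, and the block table is streamed in ceiling-division chunks.

decode_load_slots = list(range(1, 8))  # 7 slots, illustrative
chunk_size = max(1, len(decode_load_slots) // 2)  # 3 blocks per region
cpu_block_table = list(range(10))  # illustrative 10 CPU blocks
num_chunks = (len(cpu_block_table) + chunk_size - 1) // chunk_size  # ceil(10 / 3) = 4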