[WIP] remove num_prefetch_blocks variable.

Zijie Tian
2025-12-24 18:22:26 +08:00
parent b264de903d
commit 782437c486
10 changed files with 465 additions and 18 deletions

View File

@@ -22,7 +22,6 @@ class Config:
     offload_policy: str = "lru"  # "lru", "fifo", or full class path
     num_transfer_streams: int = 4  # Number of CUDA streams for async transfers
     num_gpu_blocks: int = -1  # User-specified GPU blocks count, -1 = auto (use max available)
-    num_prefetch_blocks: int = 2  # Number of prefetch blocks for three-region GPU buffer design
     # Computed fields for offload (set in __post_init__ or by ModelRunner)
     num_gpu_kvcache_blocks: int = -1
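
For orientation, the offload-related slice of Config after this removal looks roughly like the following. This is a sketch, not the full class: Config is assumed to be a dataclass (suggested by the __post_init__ mention), and the kvcache_block_size default shown here is made up; only the fields visible in the hunk above are real.

from dataclasses import dataclass

@dataclass
class Config:
    # Offload-related fields only; the real Config carries many more fields.
    kvcache_block_size: int = 256        # assumed default; field name taken from create_kvcache_manager below
    offload_policy: str = "lru"          # "lru", "fifo", or full class path
    num_transfer_streams: int = 4        # CUDA streams for async transfers
    num_gpu_blocks: int = -1             # -1 = auto (use max available)
    # Computed for offload (set in __post_init__ or by ModelRunner)
    num_gpu_kvcache_blocks: int = -1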

View File

@@ -58,14 +58,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
     from nanovllm.kvcache.policies import get_policy
     policy = get_policy(getattr(config, 'offload_policy', 'lru'))
-    num_prefetch_blocks = getattr(config, 'num_prefetch_blocks', 2)
     return HybridKVCacheManager(
         num_gpu_slots=num_gpu_blocks,
         num_cpu_blocks=num_cpu_blocks,
         block_size=config.kvcache_block_size,
         policy=policy,
-        num_prefetch_blocks=num_prefetch_blocks,
     )
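
With the parameter gone, nothing needs to be threaded from Config into the manager besides block counts, block size, and the policy. A hedged usage sketch of the factory (the import paths below are guesses; the function name and signature come from the hunk header above):

from nanovllm.config import Config                      # assumed module path
from nanovllm.kvcache import create_kvcache_manager     # assumed module path

config = Config(num_gpu_blocks=-1, offload_policy="lru")   # -1 = auto-size GPU blocks
manager = create_kvcache_manager(config)                   # no num_prefetch_blocks to pass anymore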

View File

@@ -86,7 +86,6 @@ class HybridKVCacheManager(KVCacheManager):
         num_cpu_blocks: int,
         block_size: int,
         policy: Optional[EvictionPolicy] = None,
-        num_prefetch_blocks: int = 2,
     ):
         """
         Initialize hybrid manager with CPU-primary ring buffer design.
@@ -99,13 +98,11 @@ class HybridKVCacheManager(KVCacheManager):
             num_cpu_blocks: Number of CPU pool blocks (primary storage)
             block_size: Tokens per block
             policy: Eviction policy (default: LRU, used for prefix cache management)
-            num_prefetch_blocks: Number of blocks for ring buffer pipeline (deprecated, ring_slots = num_gpu_slots)
         """
         self._block_size = block_size
         self.num_gpu_slots = num_gpu_slots
         self.num_cpu_blocks = num_cpu_blocks
         self.total_blocks = num_gpu_slots + num_cpu_blocks
-        self.num_prefetch_blocks = num_prefetch_blocks  # Ring buffer design parameter (deprecated)
         # Eviction policy
         self.policy = policy or LRUPolicy()
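
To make the bookkeeping above concrete, a small sizing example using only the attributes set in this hunk (the block counts and block size are hypothetical, not taken from the repo):

num_gpu_slots = 16      # GPU ring-buffer slots (hypothetical)
num_cpu_blocks = 1024   # CPU pool blocks, the primary storage (hypothetical)
block_size = 256        # tokens per block (hypothetical)

total_blocks = num_gpu_slots + num_cpu_blocks     # 1040, mirrors self.total_blocks
total_tokens = total_blocks * block_size          # 266240 tokens of KV cache across GPU + CPU
print(total_blocks, total_tokens)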
@@ -170,7 +167,6 @@ class HybridKVCacheManager(KVCacheManager):
             num_kv_heads=num_kv_heads,
             head_dim=head_dim,
             dtype=dtype,
-            num_prefetch_blocks=self.num_prefetch_blocks,
         )

     def get_layer_cache(self, layer_id: int) -> Tuple[Tensor, Tensor]:

View File

@@ -53,7 +53,6 @@ class OffloadEngine:
         head_dim: int,
         dtype: torch.dtype = torch.float16,
         num_streams: int = 4,
-        num_prefetch_blocks: int = 2,
     ):
         self.num_layers = num_layers
         self.num_gpu_blocks = num_gpu_blocks
@@ -82,8 +81,6 @@ class OffloadEngine:
         self.decode_load_slots = list(range(1, num_gpu_blocks))
         self.num_decode_load_slots = len(self.decode_load_slots)
-        # Keep num_prefetch_blocks for compatibility (used as chunk size for loading)
-        self.num_prefetch_blocks = num_prefetch_blocks
         self.num_gpu_slots = num_gpu_blocks  # alias
         logger.info(f"Unified Ring Buffer: {self.num_ring_slots} slots total")

View File

@@ -378,9 +378,9 @@ class Attention(nn.Module):
         offload_engine = kvcache_manager.offload_engine
-        # Use prefetch_size as chunk size for double buffering
-        # This ensures both Compute and Prefetch regions can hold a full chunk
-        chunk_size = offload_engine.num_prefetch_blocks
+        # Chunk size = capacity of each double buffer region (compute/prefetch)
+        # Each region uses half of decode_load_slots
+        chunk_size = max(1, len(offload_engine.decode_load_slots) // 2)
         num_chunks = (len(cpu_block_table) + chunk_size - 1) // chunk_size
         o_acc = None
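
For context, a runnable sketch of the chunking arithmetic these lines set up: the CPU block table is split into ceil-divided chunks of one region's capacity, and (under the double-buffering reading of the new comments, which is an assumption) successive chunks land in alternating regions. All concrete values here are hypothetical.

cpu_block_table = list(range(10))                       # hypothetical CPU block ids for one sequence
decode_load_slots = list(range(1, 9))                   # hypothetical, as in the OffloadEngine sketch above

chunk_size = max(1, len(decode_load_slots) // 2)        # 4: one double-buffer region's capacity
num_chunks = (len(cpu_block_table) + chunk_size - 1) // chunk_size  # ceil(10 / 4) = 3

for i in range(num_chunks):
    chunk = cpu_block_table[i * chunk_size:(i + 1) * chunk_size]
    region = "compute" if i % 2 == 0 else "prefetch"    # assumed: chunks alternate between the two regions
    print(f"chunk {i} -> {region} region: blocks {chunk}")
# chunk 0 -> compute region: blocks [0, 1, 2, 3]
# chunk 1 -> prefetch region: blocks [4, 5, 6, 7]
# chunk 2 -> compute region: blocks [8, 9]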