[feat] Needs to be optimized with async prefetch.

This commit is contained in:
Zijie Tian
2025-12-15 06:58:40 +08:00
parent 1081ab51ea
commit b8b6478506
9 changed files with 556 additions and 404 deletions

View File

@@ -95,16 +95,16 @@ class HybridKVCacheManager(KVCacheManager):
num_cpu_blocks: Number of CPU pool blocks (overflow or primary storage)
block_size: Tokens per block
policy: Eviction policy (default: LRU)
cpu_primary: If True, use CPU as primary storage with three-region GPU buffer.
cpu_primary: If True, use CPU as primary storage with ring buffer GPU design.
If False, use GPU as primary with CPU as overflow (legacy mode).
num_prefetch_blocks: Number of prefetch blocks for three-region GPU buffer design
num_prefetch_blocks: Number of blocks for ring buffer pipeline (deprecated, ring_slots = num_gpu_slots)
"""
self._block_size = block_size
self.num_gpu_slots = num_gpu_slots
self.num_cpu_blocks = num_cpu_blocks
self.total_blocks = num_gpu_slots + num_cpu_blocks
self.cpu_primary = cpu_primary # Three-region mode flag
self.num_prefetch_blocks = num_prefetch_blocks # Three-region design parameter
self.cpu_primary = cpu_primary # Ring buffer mode flag
self.num_prefetch_blocks = num_prefetch_blocks # Ring buffer design parameter (deprecated)
# Eviction policy
self.policy = policy or LRUPolicy()
@@ -341,7 +341,7 @@ class HybridKVCacheManager(KVCacheManager):
"""
assert not seq.block_table, "Sequence already has blocks"
# Three-region mode: all blocks are allocated to CPU
# Ring buffer mode: all blocks are allocated to CPU
if self.cpu_primary:
return self.allocate_cpu_only(seq)
@@ -471,7 +471,7 @@ class HybridKVCacheManager(KVCacheManager):
block.token_ids = []
if self.cpu_primary:
# Three-region mode: new block allocated to CPU
# Ring buffer mode: new block allocated to CPU
if not self.free_cpu_blocks:
raise RuntimeError("No free CPU blocks for decode")
cpu_block_id = self.free_cpu_blocks.popleft()
@@ -1025,14 +1025,14 @@ class HybridKVCacheManager(KVCacheManager):
break
return pos
# ========== Three-region double buffering support ==========
# ========== Ring Buffer CPU-primary support ==========
def allocate_cpu_only(self, seq: Sequence) -> None:
"""
Allocate CPU blocks for sequence (for three-region mode).
Allocate CPU blocks for sequence (for ring buffer mode).
Unlike allocate(), here all blocks are allocated to CPU,
GPU is only used as working buffer.
GPU is only used as ring buffer for computation.
Args:
seq: Sequence to allocate
@@ -1092,10 +1092,10 @@ class HybridKVCacheManager(KVCacheManager):
cpu_blocks.append(block.cpu_block_id)
else:
# If block is on GPU, it should have a corresponding CPU block
# In three-region mode, all data ultimately resides on CPU
# In ring buffer mode, all data ultimately resides on CPU
raise RuntimeError(
f"Block {logical_id} not on CPU (location={block.location}). "
f"In three-region mode, all blocks should be on CPU."
f"In ring buffer mode, all blocks should be on CPU."
)
return cpu_blocks
@@ -1171,8 +1171,8 @@ class HybridKVCacheManager(KVCacheManager):
"""
Get GPU slot for writing new KV during chunked offload decode.
In three-region design, always use Decode region (slot 0) to write new KV.
This avoids conflicts with Compute/Prefetch region loading operations.
In ring buffer design, always use decode_slot (slot[0]) to write new KV.
This avoids conflicts with loading operations which use slots[1:].
Args:
seq: Sequence