[refactor] Refactor current GPU and CPU block allocation strategy.

Zijie Tian
2025-12-10 21:23:31 +08:00
parent 0a247ccb1b
commit 190df5f70d
7 changed files with 906 additions and 162 deletions


@@ -81,20 +81,24 @@ class HybridKVCacheManager(KVCacheManager):
num_cpu_blocks: int,
block_size: int,
policy: Optional[EvictionPolicy] = None,
cpu_primary: bool = True,
):
"""
Initialize hybrid manager.
Args:
num_gpu_slots: Number of GPU buffer slots (working set)
num_cpu_blocks: Number of CPU pool blocks (overflow)
num_cpu_blocks: Number of CPU pool blocks (overflow or primary storage)
block_size: Tokens per block
policy: Eviction policy (default: LRU)
cpu_primary: If True, use CPU as primary storage with Ping-Pong GPU buffer.
If False, use GPU as primary with CPU as overflow (legacy mode).
"""
self._block_size = block_size
self.num_gpu_slots = num_gpu_slots
self.num_cpu_blocks = num_cpu_blocks
self.total_blocks = num_gpu_slots + num_cpu_blocks
self.cpu_primary = cpu_primary # Ping-Pong mode flag
# Eviction policy
self.policy = policy or LRUPolicy()
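A minimal construction sketch (not part of this diff; the slot/block counts are illustrative, and the full constructor signature is assumed from the hunk above):

    manager = HybridKVCacheManager(
        num_gpu_slots=16,       # GPU working buffer (Ping-Pong staging area)
        num_cpu_blocks=1024,    # primary KV storage when cpu_primary=True
        block_size=16,
        cpu_primary=True,       # False keeps the legacy GPU-primary / CPU-overflow mode
    )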
@@ -321,12 +325,16 @@ class HybridKVCacheManager(KVCacheManager):
"""
Allocate logical blocks for prefill.
New blocks are allocated on GPU when possible. If the GPU is full and all
GPU blocks belong to this sequence (so none can be evicted), the remaining
blocks are allocated on CPU for chunked prefill.
In cpu_primary mode (Ping-Pong): All blocks are allocated to CPU.
In legacy mode: Blocks are allocated to GPU when possible, overflow to CPU.
"""
assert not seq.block_table, "Sequence already has blocks"
# Ping-Pong mode: all blocks are allocated to CPU
if self.cpu_primary:
return self.allocate_cpu_only(seq)
# Legacy mode: GPU is primary, CPU handles overflow
h = -1
cache_miss = False
@@ -451,13 +459,22 @@ class HybridKVCacheManager(KVCacheManager):
block.hash = -1
block.token_ids = []
# New decode blocks go to GPU
gpu_slot = self._allocate_gpu_slot()
block.location = BlockLocation.GPU
block.gpu_slot = gpu_slot
self.gpu_slot_to_logical[gpu_slot] = logical_id
self.policy.on_block_allocated(gpu_slot, self.current_step)
if self.cpu_primary:
# Ping-Pong mode: allocate the new block to CPU
if not self.free_cpu_blocks:
raise RuntimeError("No free CPU blocks for decode")
cpu_block_id = self.free_cpu_blocks.popleft()
block.location = BlockLocation.CPU
block.cpu_block_id = cpu_block_id
block.gpu_slot = -1
self.cpu_block_to_logical[cpu_block_id] = logical_id
else:
# Legacy mode: allocate the new block to GPU
gpu_slot = self._allocate_gpu_slot()
block.location = BlockLocation.GPU
block.gpu_slot = gpu_slot
self.gpu_slot_to_logical[gpu_slot] = logical_id
self.policy.on_block_allocated(gpu_slot, self.current_step)
block_table.append(logical_id)
@@ -993,6 +1010,158 @@ class HybridKVCacheManager(KVCacheManager):
break
return pos
# ========== Ping-Pong double-buffering support ==========
def allocate_cpu_only(self, seq: Sequence) -> None:
"""
Allocate CPU blocks for a sequence (used in Ping-Pong mode).
Unlike allocate(), all blocks here are placed on CPU;
the GPU is used only as a working buffer.
Args:
seq: The sequence to allocate blocks for
"""
assert not seq.block_table, "Sequence already has blocks"
for i in range(seq.num_blocks):
# Allocate a CPU block
if not self.free_cpu_blocks:
raise RuntimeError(
f"No free CPU blocks. Need {seq.num_blocks}, "
f"available: {len(self.free_cpu_blocks)}"
)
cpu_block_id = self.free_cpu_blocks.popleft()
# Allocate a logical block
logical_id = self.free_logical_ids.popleft()
block = self.logical_blocks[logical_id]
block.ref_count = 1
block.location = BlockLocation.CPU
block.cpu_block_id = cpu_block_id
block.gpu_slot = -1
self.cpu_block_to_logical[cpu_block_id] = logical_id
seq.block_table.append(logical_id)
def get_cpu_block_table(self, seq: Sequence) -> List[int]:
"""
Get the list of CPU block IDs for a sequence.
Args:
seq: The sequence
Returns:
List of CPU block IDs, in sequence order
"""
cpu_blocks = []
for logical_id in seq.block_table:
block = self.logical_blocks[logical_id]
if block.location == BlockLocation.CPU:
cpu_blocks.append(block.cpu_block_id)
else:
# If a block is on GPU, it should have a corresponding CPU block:
# in Ping-Pong mode, all data ultimately lives on CPU.
raise RuntimeError(
f"Block {logical_id} not on CPU (location={block.location}). "
f"In Ping-Pong mode, all blocks should be on CPU."
)
return cpu_blocks
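# Illustrative usage sketch (not part of this diff): in cpu_primary mode a
# prefill allocation lands entirely on CPU, after which the per-block CPU ids
# can be read back in sequence order:
#   manager.allocate(seq)                       # dispatches to allocate_cpu_only(seq)
#   cpu_ids = manager.get_cpu_block_table(seq)  # one CPU block id per logical block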
def get_all_cpu_blocks(self, seq: Sequence) -> Tuple[List[int], List[int]]:
"""
Get all CPU blocks of a sequence together with their logical IDs.
Args:
seq: The sequence
Returns:
(cpu_block_ids, logical_ids)
"""
cpu_block_ids = []
logical_ids = []
for logical_id in seq.block_table:
block = self.logical_blocks[logical_id]
if block.location == BlockLocation.CPU:
cpu_block_ids.append(block.cpu_block_id)
logical_ids.append(logical_id)
return cpu_block_ids, logical_ids
def allocate_next_cpu_block(self, seq: Sequence) -> int:
"""
Allocate the next CPU block for a sequence (for new tokens during decode).
Args:
seq: The sequence
Returns:
The newly allocated CPU block ID
"""
if not self.free_cpu_blocks:
raise RuntimeError("No free CPU blocks")
cpu_block_id = self.free_cpu_blocks.popleft()
logical_id = self.free_logical_ids.popleft()
block = self.logical_blocks[logical_id]
block.ref_count = 1
block.location = BlockLocation.CPU
block.cpu_block_id = cpu_block_id
block.gpu_slot = -1
self.cpu_block_to_logical[cpu_block_id] = logical_id
seq.block_table.append(logical_id)
return cpu_block_id
def get_last_cpu_block(self, seq: Sequence) -> int:
"""
Get the CPU block ID of the sequence's last block.
Returns -1 if the last block is not on CPU.
Args:
seq: The sequence
Returns:
CPU block ID, or -1 if the last block is not on CPU
"""
if not seq.block_table:
return -1
last_logical_id = seq.block_table[-1]
block = self.logical_blocks[last_logical_id]
if block.location == BlockLocation.CPU:
return block.cpu_block_id
return -1
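# Illustrative decode-step sketch (not part of this diff): in cpu_primary mode
# the new token's KV ultimately lives on CPU, so a caller either reuses the
# last CPU block or grows the sequence by one block. `last_block_is_full` is a
# hypothetical helper named only for illustration:
#   if last_block_is_full(seq):
#       dst_cpu_block = manager.allocate_next_cpu_block(seq)
#   else:
#       dst_cpu_block = manager.get_last_cpu_block(seq)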
def get_write_slot_for_pingpong(self, seq: Sequence) -> int:
"""
Get the GPU slot where new KV is written during Ping-Pong decode.
Strategy: use the number of chunks the sequence needs to determine whether
the last chunk uses the Ping or the Pong buffer, then use that buffer's last slot.
Args:
seq: The sequence
Returns:
GPU slot ID
"""
cpu_blocks, _ = self.get_all_cpu_blocks(seq)
ping_size = self.offload_engine.ping_size
num_chunks = (len(cpu_blocks) + ping_size - 1) // ping_size if cpu_blocks else 0
# Determine which buffer the last chunk uses
if num_chunks % 2 == 1 or num_chunks == 0:
# Odd number of chunks (or zero): the last chunk uses the ping buffer
return self.offload_engine.ping_slots[-1]
else:
# Even number of chunks: the last chunk uses the pong buffer
return self.offload_engine.pong_slots[-1]
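# Worked example of the parity rule above, assuming offload_engine.ping_size == 4:
#   7 CPU blocks -> num_chunks = (7 + 3) // 4 = 2 (even) -> write at pong_slots[-1]
#   9 CPU blocks -> num_chunks = (9 + 3) // 4 = 3 (odd)  -> write at ping_slots[-1]
#   0 CPU blocks -> num_chunks = 0                       -> write at ping_slots[-1]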
def __repr__(self) -> str:
return (
f"HybridKVCacheManager(\n"