[refactor] Refactor offload code from Ping-Pong to multi-chunk (Chunked Offload).

This commit is contained in:
Zijie Tian
2025-12-15 01:13:58 +08:00
parent 5949537faf
commit 1081ab51ea
7 changed files with 36 additions and 233 deletions

View File

@@ -336,7 +336,7 @@ class HybridKVCacheManager(KVCacheManager):
"""
Allocate logical blocks for prefill.
In cpu_primary mode (Ping-Pong): All blocks are allocated to CPU.
In cpu_primary mode (Chunked Offload): All blocks are allocated to CPU.
In legacy mode: Blocks are allocated to GPU when possible, overflow to CPU.
"""
assert not seq.block_table, "Sequence already has blocks"
@@ -1167,9 +1167,9 @@ class HybridKVCacheManager(KVCacheManager):
return block.cpu_block_id
return -1
def get_write_slot_for_pingpong(self, seq: Sequence) -> int:
def get_write_slot_for_chunked_offload(self, seq: Sequence) -> int:
"""
Get GPU slot for writing new KV during three-region decode.
Get GPU slot for writing new KV during chunked offload decode.
In three-region design, always use Decode region (slot 0) to write new KV.
This avoids conflicts with Compute/Prefetch region loading operations.