[refactor] Refactor offload code to multi-chunk.

2025-12-15 01:13:58 +08:00
parent 5949537faf
commit 1081ab51ea
7 changed files with 36 additions and 233 deletions
--- a/nanovllm/kvcache/hybrid_manager.py
+++ b/nanovllm/kvcache/hybrid_manager.py
@@ -336,7 +336,7 @@ class HybridKVCacheManager(KVCacheManager):
        """
        Allocate logical blocks for prefill.

-        In cpu_primary mode (Ping-Pong): All blocks are allocated to CPU.
+        In cpu_primary mode (Chunked Offload): All blocks are allocated to CPU.
        In legacy mode: Blocks are allocated to GPU when possible, overflow to CPU.
        """
        assert not seq.block_table, "Sequence already has blocks"
@@ -1167,9 +1167,9 @@ class HybridKVCacheManager(KVCacheManager):
            return block.cpu_block_id
        return -1

-    def get_write_slot_for_pingpong(self, seq: Sequence) -> int:
+    def get_write_slot_for_chunked_offload(self, seq: Sequence) -> int:
        """
-        Get GPU slot for writing new KV during three-region decode.
+        Get GPU slot for writing new KV during chunked offload decode.

        In three-region design, always use Decode region (slot 0) to write new KV.
        This avoids conflicts with Compute/Prefetch region loading operations.