[fix] Fixed kvcache offload bugs.

Zijie Tian
2025-12-10 22:34:00 +08:00
parent 190df5f70d
commit e85c2b4776
7 changed files with 409 additions and 156 deletions

View File

@@ -22,6 +22,7 @@ class Config:
     offload_policy: str = "lru"  # "lru", "fifo", or full class path
     num_transfer_streams: int = 4  # Number of CUDA streams for async transfers
     num_gpu_blocks: int = -1  # User-specified GPU blocks count, -1 = auto (use max available)
+    num_prefetch_blocks: int = 2  # Number of blocks in the prefetch region (three-region GPU buffer design)
     # Computed fields for offload (set in __post_init__ or by ModelRunner)
     num_gpu_kvcache_blocks: int = -1
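For reference, a minimal sketch of how the new three-region split plays out for a given block budget. The helper below is hypothetical (not part of this commit); the slot numbering follows the OffloadEngine layout introduced later in the diff: slot 0 for decode, the tail slots for prefetch, the rest for compute.

```python
# Hypothetical helper illustrating the three-region GPU buffer split
# (decode / compute / prefetch) implied by num_prefetch_blocks.
def split_gpu_slots(num_gpu_blocks: int, num_prefetch_blocks: int = 2):
    assert num_gpu_blocks >= 1 + 1 + num_prefetch_blocks
    decode_slot = 0  # fixed slot for the new token's KV
    compute_slots = list(range(1, num_gpu_blocks - num_prefetch_blocks))
    prefetch_slots = list(range(num_gpu_blocks - num_prefetch_blocks, num_gpu_blocks))
    return decode_slot, compute_slots, prefetch_slots

# e.g. 8 GPU blocks with 2 prefetch blocks:
# (0, [1, 2, 3, 4, 5], [6, 7])
print(split_gpu_slots(8, 2))
```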

View File

@@ -148,19 +148,39 @@ class ModelRunner:
     dtype=hf_config.torch_dtype,
 )
-# Log KV cache allocation info
+# Log KV cache allocation info with detailed per-token breakdown
 gpu_memory_mb = config.num_gpu_kvcache_blocks * block_bytes / (1024 ** 2)
 cpu_memory_mb = config.num_cpu_kvcache_blocks * block_bytes / (1024 ** 2)
 total_memory_mb = gpu_memory_mb + cpu_memory_mb
+# Calculate per-token KV cache usage
+# KV per token = 2 (K+V) * num_layers * kv_heads * head_dim * dtype_size
+dtype_size = 2 if hf_config.torch_dtype in [torch.float16, torch.bfloat16] else 4
+per_token_kv_bytes = 2 * hf_config.num_hidden_layers * num_kv_heads * head_dim * dtype_size
+per_token_kv_kb = per_token_kv_bytes / 1024
+logger.info(
+    f"KV Cache per-token: {per_token_kv_kb:.2f}KB "
+    f"(2 * {hf_config.num_hidden_layers}layers * {num_kv_heads}kv_heads * {head_dim}head_dim * {dtype_size}bytes)"
+)
+logger.info(
+    f"KV Cache per-block: {block_bytes / (1024**2):.2f}MB "
+    f"({per_token_kv_kb:.2f}KB * {self.block_size}tokens)"
+)
 if config.enable_cpu_offload:
+    ping_size = config.num_gpu_kvcache_blocks // 2
+    tokens_per_ping = ping_size * self.block_size
     logger.info(
         f"KV Cache allocated (Ping-Pong mode): "
         f"GPU={config.num_gpu_kvcache_blocks} blocks ({gpu_memory_mb:.1f}MB), "
         f"CPU={config.num_cpu_kvcache_blocks} blocks ({cpu_memory_mb:.1f}MB), "
-        f"Total={total_memory_mb:.1f}MB, "
-        f"block_size={self.block_size}, "
-        f"ping_size={config.num_gpu_kvcache_blocks // 2}"
+        f"Total={total_memory_mb:.1f}MB"
+    )
+    logger.info(
+        f"Ping-Pong config: ping_size={ping_size} blocks, "
+        f"tokens_per_chunk={tokens_per_ping}, "
+        f"block_size={self.block_size}"
     )
 else:
     logger.info(
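As a quick check on the per-token formula this logging reports, here is the arithmetic for a hypothetical model (32 layers, 8 KV heads, head_dim 128, fp16, 16-token blocks; the numbers are illustrative, not taken from this repo):

```python
# Illustrative only: hypothetical model with 32 layers, 8 KV heads,
# head_dim=128, fp16 (2 bytes/element), block_size=16 tokens.
num_layers, kv_heads, head_dim, dtype_size, block_size = 32, 8, 128, 2, 16

per_token_kv_bytes = 2 * num_layers * kv_heads * head_dim * dtype_size  # K and V
print(per_token_kv_bytes / 1024)                # 128.0 KB per token
print(per_token_kv_bytes * block_size / 2**20)  # 2.0 MB per block
```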
@@ -392,12 +412,12 @@ class ModelRunner:
 def _should_use_pingpong(self, seqs: list[Sequence], is_prefill: bool) -> bool:
     """
-    Check if Ping-Pong mode should be used.
-    Use Ping-Pong when:
+    Check if three-region mode should be used.
+    Use three-region mode when:
     - CPU offload is enabled
     - There are blocks on CPU (either allocated there or offloaded)
-    - Sequence exceeds GPU capacity
+    - Sequence exceeds GPU compute-region capacity
     """
     if not hasattr(self.kvcache_manager, 'offload_engine'):
         return False
@@ -409,12 +429,12 @@ class ModelRunner:
 # Check if any blocks are on CPU
 cpu_blocks, _ = self.kvcache_manager.get_all_cpu_blocks(seq)
 if cpu_blocks:
-    # Has CPU blocks - use Ping-Pong
+    # Has CPU blocks - use three-region mode
     return True
-# Check if sequence needs more blocks than GPU can hold
-ping_size = self.kvcache_manager.offload_engine.ping_size
-if seq.num_blocks > ping_size:
+# Check if sequence needs more blocks than the GPU compute region can hold
+compute_size = self.kvcache_manager.offload_engine.num_compute_blocks
+if seq.num_blocks > compute_size:
     # Needs chunked processing
     return True
@@ -610,29 +630,28 @@ class ModelRunner:
 def run_pingpong_prefill(self, seqs: list[Sequence]) -> list[int]:
     """
-    Run prefill with Ping-Pong dual buffer (CPU is primary storage).
+    Run prefill with the three-region GPU buffer (CPU is primary storage).
     Flow:
     1. All blocks are allocated to CPU (primary storage)
-    2. Process tokens in chunks using Ping/Pong GPU buffers
-    3. After each chunk, offload from GPU to CPU
-    4. Alternate between Ping and Pong buffers
+    2. Process tokens in chunks using the compute-region GPU buffer
+    3. After each chunk, offload from the compute region to CPU
+    4. The prefetch region is used to load previous KV (if any)
     """
     import sys
-    assert len(seqs) == 1, "Ping-Pong prefill only supports single sequence"
+    assert len(seqs) == 1, "Three-region prefill only supports single sequence"
     seq = seqs[0]
     offload_engine = self.kvcache_manager.offload_engine
-    ping_size = offload_engine.ping_size
-    tokens_per_chunk = ping_size * self.block_size
+    compute_size = offload_engine.num_compute_blocks
+    tokens_per_chunk = compute_size * self.block_size
     total_tokens = len(seq)
-    print(f"[Ping-Pong Prefill] Starting: {total_tokens} tokens, "
-          f"ping_size={ping_size} blocks, chunk={tokens_per_chunk} tokens",
+    print(f"[Three-Region Prefill] Starting: {total_tokens} tokens, "
+          f"compute_size={compute_size} blocks, chunk={tokens_per_chunk} tokens",
           file=sys.stderr)
-    current_buffer = "ping"
     chunk_num = 0
     logits = None
     processed_tokens = 0
@@ -651,15 +670,13 @@ class ModelRunner:
 end_block_idx = (chunk_end + self.block_size - 1) // self.block_size
 num_blocks = end_block_idx - start_block_idx
-print(f"[Ping-Pong Prefill] Chunk {chunk_num}: tokens {chunk_start}-{chunk_end}, "
-      f"blocks {start_block_idx}-{end_block_idx-1}, buffer={current_buffer}",
+print(f"[Three-Region Prefill] Chunk {chunk_num}: tokens {chunk_start}-{chunk_end}, "
+      f"blocks {start_block_idx}-{end_block_idx-1}, "
+      f"compute_slots={offload_engine.compute_slots[:num_blocks]}",
       file=sys.stderr)
-# Get GPU slots for this chunk (Ping or Pong buffer)
-if current_buffer == "ping":
-    gpu_slots = offload_engine.ping_slots[:num_blocks]
-else:
-    gpu_slots = offload_engine.pong_slots[:num_blocks]
+# Get GPU slots for this chunk (from the compute region)
+gpu_slots = offload_engine.compute_slots[:num_blocks]
 # Prepare inputs
 input_ids, positions = self._prepare_pingpong_chunk(
@@ -678,24 +695,19 @@ class ModelRunner:
 logical_id = seq.block_table[i]
 self.kvcache_manager.prefilled_blocks.add(logical_id)
-# Offload this chunk from GPU to CPU (async)
+# Offload this chunk from the compute region to CPU (async)
 chunk_cpu_blocks = cpu_block_ids[start_block_idx:end_block_idx]
-offload_engine.offload_buffer_to_cpu(current_buffer, chunk_cpu_blocks)
-# Switch buffer for next chunk
-if current_buffer == "ping":
-    offload_engine.wait_ping_offload_done()
-    current_buffer = "pong"
-else:
-    offload_engine.wait_pong_offload_done()
-    current_buffer = "ping"
+offload_engine.offload_compute_to_cpu(chunk_cpu_blocks)
+# Wait for offload to complete before next chunk
+offload_engine.wait_all_offload_done()
 processed_tokens = chunk_end
 # Wait for all offloads to complete
 offload_engine.wait_all_offload_done()
-print(f"[Ping-Pong Prefill] Complete: {chunk_num} chunks", file=sys.stderr)
+print(f"[Three-Region Prefill] Complete: {chunk_num} chunks", file=sys.stderr)
 # Sample from last logits
 temperatures = self.prepare_sample(seqs) if self.rank == 0 else None
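Putting the prefill hunks above together, a condensed sketch of the chunking loop after this change (simplified; `run_chunk` is a placeholder for input preparation plus the forward pass, and names follow the diff):

```python
# Condensed sketch of the three-region chunked prefill driver (simplified).
def chunked_prefill(seq, offload_engine, block_size, cpu_block_ids, run_chunk):
    compute_size = offload_engine.num_compute_blocks
    tokens_per_chunk = compute_size * block_size
    processed = 0
    while processed < len(seq):
        chunk_start = processed
        chunk_end = min(chunk_start + tokens_per_chunk, len(seq))
        start_block = chunk_start // block_size
        end_block = (chunk_end + block_size - 1) // block_size
        # The current chunk's new KV is written into compute-region slots.
        gpu_slots = offload_engine.compute_slots[:end_block - start_block]
        run_chunk(chunk_start, chunk_end, gpu_slots)
        # Copy the freshly written blocks back to their CPU homes, then wait,
        # so the same compute slots can be reused by the next chunk.
        offload_engine.offload_compute_to_cpu(cpu_block_ids[start_block:end_block])
        offload_engine.wait_all_offload_done()
        processed = chunk_end
```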
@@ -764,25 +776,26 @@ class ModelRunner:
 def run_pingpong_decode(self, seqs: list[Sequence]) -> list[int]:
     """
-    Run decode with Ping-Pong dual buffer.
-    All KV is on CPU. Uses Ping-Pong to load KV chunks and compute attention.
-    New token's KV is written to GPU then offloaded to CPU.
+    Run decode with the three-region GPU buffer.
+    All KV is on CPU. Uses the decode region to write new KV and the compute/prefetch regions to load KV chunks.
+    New token's KV is written to the decode region (slot 0) then offloaded to CPU.
+    Key point: the decode region is never overwritten by compute/prefetch loads; it is reserved for writing new KV.
     """
-    assert len(seqs) == 1, "Ping-Pong decode only supports single sequence"
+    assert len(seqs) == 1, "Three-region decode only supports single sequence"
     seq = seqs[0]
+    offload_engine = self.kvcache_manager.offload_engine
     # Prepare inputs
     input_ids = torch.tensor([seq.last_token], dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
     positions = torch.tensor([len(seq) - 1], dtype=torch.int64, pin_memory=True).cuda(non_blocking=True)
-    # Get write slot for new KV (will use last slot of the buffer used for final chunk)
-    write_slot = self.kvcache_manager.get_write_slot_for_pingpong(seq)
-    # Calculate position in block for slot mapping
-    last_block_idx = seq.num_blocks - 1
+    # Use the decode region (slot 0) to write the new token's KV
+    decode_slot = offload_engine.decode_slot  # = 0
     pos_in_block = (len(seq) - 1) % self.block_size
-    slot = write_slot * self.block_size + pos_in_block
+    slot = decode_slot * self.block_size + pos_in_block
     slot_mapping = torch.tensor([slot], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
     context_len = torch.tensor([len(seq)], dtype=torch.int32, pin_memory=True).cuda(non_blocking=True)
@@ -794,17 +807,18 @@ class ModelRunner:
 is_chunked_prefill=True,  # Use chunked attention path
 offload_engine=self.kvcache_manager,
 chunked_seq=seq,
+decode_pos_in_block=pos_in_block,
 )
 # Run model forward pass
 logits = self.run_model(input_ids, positions, is_prefill=False)
 reset_context()
-# Offload new KV from write_slot to CPU
+# Offload new KV from the decode region to CPU
 last_cpu_block = self.kvcache_manager.get_last_cpu_block(seq)
 if last_cpu_block >= 0:
-    self.kvcache_manager.offload_engine.offload_slot_to_cpu(write_slot, last_cpu_block)
-    self.kvcache_manager.offload_engine.wait_all_offload_done()
+    offload_engine.offload_decode_slot(last_cpu_block)
+    offload_engine.wait_all_offload_done()
 # Sample
 temperatures = self.prepare_sample(seqs) if self.rank == 0 else None
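The decode slot mapping is now fixed: the new token's KV always lands in the decode slot. A tiny arithmetic sketch (block size and sequence length are illustrative):

```python
# With the three-region buffer, decode always writes into slot 0.
block_size = 16       # illustrative
decode_slot = 0       # per the diff: offload_engine.decode_slot
seq_len = 37          # hypothetical current sequence length

pos_in_block = (seq_len - 1) % block_size       # 36 % 16 = 4
slot = decode_slot * block_size + pos_in_block  # 0 * 16 + 4 = 4
# The new KV is written at offset 4 inside GPU block 0, then offloaded
# to the sequence's last CPU block via offload_decode_slot().
```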

View File

@@ -58,12 +58,14 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
 from nanovllm.kvcache.policies import get_policy
 policy = get_policy(getattr(config, 'offload_policy', 'lru'))
+num_prefetch_blocks = getattr(config, 'num_prefetch_blocks', 2)
 return HybridKVCacheManager(
     num_gpu_slots=num_gpu_blocks,
     num_cpu_blocks=num_cpu_blocks,
     block_size=config.kvcache_block_size,
     policy=policy,
+    num_prefetch_blocks=num_prefetch_blocks,
 )
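A minimal sketch of what this factory now builds, with the new parameter spelled out (block counts are illustrative; `HybridKVCacheManager` is assumed to be imported as in this factory module):

```python
# Illustrative construction mirroring the factory change above.
from nanovllm.kvcache.policies import get_policy

manager = HybridKVCacheManager(
    num_gpu_slots=8,          # GPU buffer blocks (decode + compute + prefetch)
    num_cpu_blocks=256,       # CPU primary storage
    block_size=16,
    policy=get_policy("lru"),
    num_prefetch_blocks=2,    # new parameter introduced by this commit
)
```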

View File

@@ -9,6 +9,7 @@ Key design for CUDA Graph compatibility:
 5. Graph replay only needs index updates (tiny overhead)
 """
+import logging
 from collections import deque
 from dataclasses import dataclass, field
 from enum import Enum, auto
@@ -16,6 +17,8 @@ from typing import List, Tuple, Dict, Set, Optional
 import torch
 from torch import Tensor
+logger = logging.getLogger(__name__)
 from nanovllm.engine.sequence import Sequence
 from nanovllm.kvcache.base_manager import KVCacheManager
 from nanovllm.kvcache.offload_engine import OffloadEngine
@@ -82,6 +85,7 @@ class HybridKVCacheManager(KVCacheManager):
 block_size: int,
 policy: Optional[EvictionPolicy] = None,
 cpu_primary: bool = True,
+num_prefetch_blocks: int = 2,
 ):
 """
 Initialize hybrid manager.
@@ -91,14 +95,16 @@ class HybridKVCacheManager(KVCacheManager):
 num_cpu_blocks: Number of CPU pool blocks (overflow or primary storage)
 block_size: Tokens per block
 policy: Eviction policy (default: LRU)
-cpu_primary: If True, use CPU as primary storage with Ping-Pong GPU buffer.
+cpu_primary: If True, use CPU as primary storage with the three-region GPU buffer.
              If False, use GPU as primary with CPU as overflow (legacy mode).
+num_prefetch_blocks: Number of prefetch blocks for the three-region GPU buffer design
 """
 self._block_size = block_size
 self.num_gpu_slots = num_gpu_slots
 self.num_cpu_blocks = num_cpu_blocks
 self.total_blocks = num_gpu_slots + num_cpu_blocks
-self.cpu_primary = cpu_primary  # Ping-Pong mode flag
+self.cpu_primary = cpu_primary  # three-region mode flag
+self.num_prefetch_blocks = num_prefetch_blocks  # three-region design parameter
 # Eviction policy
 self.policy = policy or LRUPolicy()
@@ -156,6 +162,7 @@ class HybridKVCacheManager(KVCacheManager):
 num_kv_heads=num_kv_heads,
 head_dim=head_dim,
 dtype=dtype,
+num_prefetch_blocks=self.num_prefetch_blocks,
 )
 def get_layer_cache(self, layer_id: int) -> Tuple[Tensor, Tensor]:
@@ -948,6 +955,10 @@ class HybridKVCacheManager(KVCacheManager):
 block = self.logical_blocks[logical_id]
 if block.location == BlockLocation.CPU:
     cpu_blocks.append(block.cpu_block_id)
+logger.debug(
+    f"get_prefilled_cpu_blocks: prefilled_blocks={list(self.prefilled_blocks)}, "
+    f"returned cpu_blocks={cpu_blocks}"
+)
 return cpu_blocks
 def load_prev_kv_for_layer(
@@ -1139,28 +1150,18 @@ class HybridKVCacheManager(KVCacheManager):
 def get_write_slot_for_pingpong(self, seq: Sequence) -> int:
     """
-    Get the GPU slot that Ping-Pong decode writes new KV into.
-    Strategy: use the number of chunks the sequence needs to decide whether the
-    last chunk used the Ping or the Pong buffer, then use that buffer's last slot.
+    Get the GPU slot that three-region decode writes new KV into.
+    In the three-region design, new KV is always written to the decode region (slot 0).
+    This avoids conflicts with loads into the compute/prefetch regions.
     Args:
         seq: The sequence
     Returns:
-        GPU slot ID
+        GPU slot ID (always decode_slot = 0)
     """
-    cpu_blocks, _ = self.get_all_cpu_blocks(seq)
-    ping_size = self.offload_engine.ping_size
-    num_chunks = (len(cpu_blocks) + ping_size - 1) // ping_size if cpu_blocks else 0
-    # Which buffer did the last chunk use?
-    if num_chunks % 2 == 1 or num_chunks == 0:
-        # Odd number of chunks (or zero): the last chunk used ping
-        return self.offload_engine.ping_slots[-1]
-    else:
-        # Even number of chunks: the last chunk used pong
-        return self.offload_engine.pong_slots[-1]
+    return self.offload_engine.decode_slot
 def __repr__(self) -> str:
     return (

View File

@@ -53,6 +53,7 @@ class OffloadEngine:
 head_dim: int,
 dtype: torch.dtype = torch.float16,
 num_streams: int = 4,
+num_prefetch_blocks: int = 2,
 ):
 self.num_layers = num_layers
 self.num_gpu_blocks = num_gpu_blocks
@@ -64,31 +65,59 @@ class OffloadEngine:
 self.kv_dim = num_kv_heads * head_dim
 self.block_numel = block_size * self.kv_dim
-# ========== Ping-Pong double-buffer configuration ==========
-assert num_gpu_blocks >= 2, "Ping-Pong requires at least 2 GPU blocks"
-self.ping_size = num_gpu_blocks // 2
-self.pong_size = num_gpu_blocks - self.ping_size
-self.ping_slots = list(range(self.ping_size))  # [0, 1, 2, ...]
-self.pong_slots = list(range(self.ping_size, num_gpu_blocks))  # [ping_size, ping_size+1, ...]
+# ========== Three-region GPU buffer configuration ==========
+# Constraint checks
+assert num_gpu_blocks >= 3, \
+    f"Need at least 3 GPU blocks: 1 decode + 1 compute + 1 prefetch, got {num_gpu_blocks}"
+assert num_prefetch_blocks >= 1, \
+    f"Need at least 1 prefetch block, got {num_prefetch_blocks}"
+assert num_gpu_blocks >= 1 + 1 + num_prefetch_blocks, \
+    f"Not enough GPU blocks: need 1 (decode) + 1 (compute) + {num_prefetch_blocks} (prefetch), got {num_gpu_blocks}"
+# Three-region configuration
+# Decode region: [0] - a single fixed block for writing new KV
+self.decode_slot = 0
+# Compute region: [1, ..., num_gpu_blocks - num_prefetch_blocks - 1]
+compute_start = 1
+compute_end = num_gpu_blocks - num_prefetch_blocks
+self.compute_slots = list(range(compute_start, compute_end))
+self.num_compute_blocks = len(self.compute_slots)
+# Prefetch region: [num_gpu_blocks - num_prefetch_blocks, ..., num_gpu_blocks - 1]
+prefetch_start = compute_end
+self.prefetch_slots = list(range(prefetch_start, num_gpu_blocks))
+self.num_prefetch_blocks = num_prefetch_blocks
 self.num_gpu_slots = num_gpu_blocks  # alias
+# Keep the old ping/pong attributes for compatibility (to be removed later)
+self.ping_size = self.num_compute_blocks
+self.pong_size = self.num_prefetch_blocks
+self.ping_slots = self.compute_slots.copy()
+self.pong_slots = self.prefetch_slots.copy()
+logger.info(f"Three-region GPU buffer: decode_slot={self.decode_slot}, "
+            f"compute_slots={self.compute_slots}, prefetch_slots={self.prefetch_slots}")
 # ========== Fixed-address GPU KV cache ==========
 # Shape: [num_layers, num_gpu_blocks, block_size, kv_heads, head_dim]
-self.k_cache_gpu = torch.empty(
+# Initialize with zeros to avoid uninitialized-memory issues
+self.k_cache_gpu = torch.zeros(
     num_layers, num_gpu_blocks, block_size, num_kv_heads, head_dim,
     dtype=dtype, device="cuda"
 )
-self.v_cache_gpu = torch.empty(
+self.v_cache_gpu = torch.zeros(
     num_layers, num_gpu_blocks, block_size, num_kv_heads, head_dim,
     dtype=dtype, device="cuda"
 )
 # ========== Fixed-address CPU KV cache (pinned memory) ==========
-self.k_cache_cpu = torch.empty(
+self.k_cache_cpu = torch.zeros(
     num_layers, num_cpu_blocks, block_size, num_kv_heads, head_dim,
     dtype=dtype, device="cpu", pin_memory=True
 )
-self.v_cache_cpu = torch.empty(
+self.v_cache_cpu = torch.zeros(
     num_layers, num_cpu_blocks, block_size, num_kv_heads, head_dim,
     dtype=dtype, device="cpu", pin_memory=True
 )
@@ -111,14 +140,18 @@ class OffloadEngine:
 self.compute_stream = torch.cuda.current_stream()
 self._stream_idx = 0
-# ========== Dedicated Ping-Pong stream and events ==========
-self.pingpong_stream = torch.cuda.Stream()  # dedicated to Ping-Pong transfers
-# Sync events - load complete
-self.ping_ready = torch.cuda.Event()
-self.pong_ready = torch.cuda.Event()
-# Sync events - offload complete
+# ========== Dedicated three-region stream and events ==========
+self.transfer_stream_main = torch.cuda.Stream()  # main transfer stream
+# Sync events - three-region loads complete
+self.compute_ready = torch.cuda.Event()
+self.prefetch_ready = torch.cuda.Event()
+self.decode_offload_done = torch.cuda.Event()
+# Keep the old ping/pong stream and events for compatibility (to be removed later)
+self.pingpong_stream = self.transfer_stream_main
+self.ping_ready = self.compute_ready
+self.pong_ready = self.prefetch_ready
 self.ping_offload_done = torch.cuda.Event()
 self.pong_offload_done = torch.cuda.Event()
@@ -535,7 +568,7 @@ class OffloadEngine:
f" kv_heads={self.num_kv_heads},\n" f" kv_heads={self.num_kv_heads},\n"
f" head_dim={self.head_dim},\n" f" head_dim={self.head_dim},\n"
f" dtype={self.dtype},\n" f" dtype={self.dtype},\n"
f" ping_size={self.ping_size}, pong_size={self.pong_size},\n" f" 三区域: decode_slot={self.decode_slot}, compute={self.compute_slots}, prefetch={self.prefetch_slots},\n"
f" gpu_memory={self.gpu_memory_bytes() / 1024**2:.1f}MB,\n" f" gpu_memory={self.gpu_memory_bytes() / 1024**2:.1f}MB,\n"
f" cpu_memory={self.cpu_memory_bytes() / 1024**2:.1f}MB\n" f" cpu_memory={self.cpu_memory_bytes() / 1024**2:.1f}MB\n"
f")" f")"
@@ -742,4 +775,189 @@ class OffloadEngine:
 v = self.v_cache_gpu[layer_id, gpu_slots]
 k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
 v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
 return k, v
+# ========== Three-region GPU buffer methods ==========
+def load_to_compute(self, cpu_block_ids: List[int]) -> None:
+    """
+    Asynchronously load CPU blocks into the compute region.
+    Args:
+        cpu_block_ids: List of CPU block IDs to load
+    """
+    if not cpu_block_ids:
+        self.compute_ready.record(self.transfer_stream_main)
+        return
+    num_to_load = min(len(cpu_block_ids), len(self.compute_slots))
+    logger.debug(f"Compute load: CPU{cpu_block_ids[:num_to_load]} -> GPU compute slots {self.compute_slots[:num_to_load]}")
+    with torch.cuda.stream(self.transfer_stream_main):
+        for i in range(num_to_load):
+            cpu_id = cpu_block_ids[i]
+            gpu_slot = self.compute_slots[i]
+            # Copy all layers at once
+            self.k_cache_gpu[:, gpu_slot].copy_(
+                self.k_cache_cpu[:, cpu_id], non_blocking=True
+            )
+            self.v_cache_gpu[:, gpu_slot].copy_(
+                self.v_cache_cpu[:, cpu_id], non_blocking=True
+            )
+        self.compute_ready.record(self.transfer_stream_main)
+def load_to_prefetch(self, cpu_block_ids: List[int]) -> None:
+    """
+    Asynchronously load CPU blocks into the prefetch region.
+    Args:
+        cpu_block_ids: List of CPU block IDs to load
+    """
+    if not cpu_block_ids:
+        self.prefetch_ready.record(self.transfer_stream_main)
+        return
+    num_to_load = min(len(cpu_block_ids), len(self.prefetch_slots))
+    logger.debug(f"Prefetch load: CPU{cpu_block_ids[:num_to_load]} -> GPU prefetch slots {self.prefetch_slots[:num_to_load]}")
+    with torch.cuda.stream(self.transfer_stream_main):
+        for i in range(num_to_load):
+            cpu_id = cpu_block_ids[i]
+            gpu_slot = self.prefetch_slots[i]
+            self.k_cache_gpu[:, gpu_slot].copy_(
+                self.k_cache_cpu[:, cpu_id], non_blocking=True
+            )
+            self.v_cache_gpu[:, gpu_slot].copy_(
+                self.v_cache_cpu[:, cpu_id], non_blocking=True
+            )
+        self.prefetch_ready.record(self.transfer_stream_main)
+def wait_compute(self) -> None:
+    """Wait for the compute-region load to complete."""
+    self.compute_stream.wait_event(self.compute_ready)
+def wait_prefetch(self) -> None:
+    """Wait for the prefetch-region load to complete."""
+    self.compute_stream.wait_event(self.prefetch_ready)
+def swap_compute_prefetch(self) -> None:
+    """Swap the roles of the compute and prefetch regions."""
+    self.compute_slots, self.prefetch_slots = self.prefetch_slots, self.compute_slots
+    # Also update the old ping/pong slots to stay compatible
+    self.ping_slots, self.pong_slots = self.pong_slots, self.ping_slots
+def offload_decode_slot(self, cpu_block_id: int) -> None:
+    """
+    Offload the decode region's KV to CPU.
+    Args:
+        cpu_block_id: Target CPU block ID
+    """
+    logger.debug(f"Decode offload: GPU[{self.decode_slot}] -> CPU[{cpu_block_id}]")
+    with torch.cuda.stream(self.transfer_stream_main):
+        self.transfer_stream_main.wait_stream(self.compute_stream)
+        self.k_cache_cpu[:, cpu_block_id].copy_(
+            self.k_cache_gpu[:, self.decode_slot], non_blocking=True
+        )
+        self.v_cache_cpu[:, cpu_block_id].copy_(
+            self.v_cache_gpu[:, self.decode_slot], non_blocking=True
+        )
+        self.decode_offload_done.record(self.transfer_stream_main)
+def wait_decode_offload(self) -> None:
+    """Wait for the decode-region offload to complete."""
+    self.compute_stream.wait_event(self.decode_offload_done)
+def get_kv_for_compute(
+    self,
+    layer_id: int,
+    num_blocks: int,
+) -> Tuple[Tensor, Tensor]:
+    """
+    Get the KV held in the first num_blocks blocks of the compute region.
+    Args:
+        layer_id: Layer ID
+        num_blocks: Number of blocks needed
+    Returns:
+        (k_cache, v_cache), shape: [1, num_blocks * block_size, kv_heads, head_dim]
+    """
+    slots = self.compute_slots[:num_blocks]
+    k = self.k_cache_gpu[layer_id, slots]  # [num_blocks, block_size, heads, dim]
+    v = self.v_cache_gpu[layer_id, slots]
+    # Reshape: [num_blocks, block_size, heads, dim] -> [1, num_blocks*block_size, heads, dim]
+    k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
+    v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
+    return k, v
+def get_kv_for_prefetch(
+    self,
+    layer_id: int,
+    num_blocks: int,
+) -> Tuple[Tensor, Tensor]:
+    """
+    Get the KV held in the first num_blocks blocks of the prefetch region.
+    Args:
+        layer_id: Layer ID
+        num_blocks: Number of blocks needed
+    Returns:
+        (k_cache, v_cache), shape: [1, num_blocks * block_size, kv_heads, head_dim]
+    """
+    slots = self.prefetch_slots[:num_blocks]
+    k = self.k_cache_gpu[layer_id, slots]
+    v = self.v_cache_gpu[layer_id, slots]
+    k = k.reshape(1, -1, self.num_kv_heads, self.head_dim)
+    v = v.reshape(1, -1, self.num_kv_heads, self.head_dim)
+    return k, v
+def get_kv_for_decode_slot(
+    self,
+    layer_id: int,
+    pos_in_block: int,
+) -> Tuple[Tensor, Tensor]:
+    """
+    Get the KV at a given position in the decode region (the new token during decode).
+    Args:
+        layer_id: Layer ID
+        pos_in_block: Position of the token within the block (0 to block_size-1)
+    Returns:
+        (k_cache, v_cache), shape: [1, 1, kv_heads, head_dim]
+    """
+    k = self.k_cache_gpu[layer_id, self.decode_slot, pos_in_block:pos_in_block+1]  # [1, heads, dim]
+    v = self.v_cache_gpu[layer_id, self.decode_slot, pos_in_block:pos_in_block+1]
+    k = k.unsqueeze(0)  # [1, 1, heads, dim]
+    v = v.unsqueeze(0)
+    return k, v
+def offload_compute_to_cpu(self, cpu_block_ids: List[int]) -> None:
+    """
+    Offload the compute region's KV to CPU.
+    Args:
+        cpu_block_ids: List of target CPU block IDs
+    """
+    if not cpu_block_ids:
+        return
+    num_to_offload = min(len(cpu_block_ids), len(self.compute_slots))
+    logger.debug(f"Compute offload: GPU {self.compute_slots[:num_to_offload]} -> CPU{cpu_block_ids[:num_to_offload]}")
+    with torch.cuda.stream(self.transfer_stream_main):
+        # Wait for compute to finish before reading the slots
+        self.transfer_stream_main.wait_stream(self.compute_stream)
+        for i in range(num_to_offload):
+            gpu_slot = self.compute_slots[i]
+            cpu_id = cpu_block_ids[i]
+            self.k_cache_cpu[:, cpu_id].copy_(
+                self.k_cache_gpu[:, gpu_slot], non_blocking=True
+            )
+            self.v_cache_cpu[:, cpu_id].copy_(
+                self.v_cache_gpu[:, gpu_slot], non_blocking=True
+            )
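Taken together, a caller drives these new methods in a load, wait, compute, offload cycle per compute-region chunk. A minimal usage sketch (block IDs are illustrative; `engine` is an already-constructed OffloadEngine, and the attention call itself is elided):

```python
# Minimal driver sketch for one compute-region chunk (illustrative IDs).
cpu_chunk = [10, 11, 12, 13]              # CPU blocks holding this chunk's KV

engine.load_to_compute(cpu_chunk)         # async H2D copies on transfer_stream_main
engine.wait_compute()                     # compute stream waits on compute_ready

k, v = engine.get_kv_for_compute(layer_id=0, num_blocks=len(cpu_chunk))
# ... run attention against (k, v) on the compute stream ...

engine.offload_compute_to_cpu(cpu_chunk)  # async D2H copies back to the same CPU blocks
engine.wait_all_offload_done()            # existing barrier before the slots are reused
```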

View File

@@ -1,3 +1,4 @@
+import logging
 import torch
 from torch import nn
 import triton
@@ -6,6 +7,8 @@ import triton.language as tl
 from flash_attn.flash_attn_interface import flash_attn_varlen_func, flash_attn_with_kvcache
 from nanovllm.utils.context import get_context
+logger = logging.getLogger(__name__)
 @triton.jit
 def store_kvcache_kernel(
@@ -97,13 +100,16 @@ class Attention(nn.Module):
 context,
 ) -> torch.Tensor:
 """
-Compute attention with Ping-Pong dual buffer for chunked prefill.
+Compute attention with the three-region GPU buffer for chunked prefill.
 For chunked prefill:
-1. Load previous KV from CPU using Ping-Pong (if any previous chunks)
+1. Load previous KV from CPU using the compute/prefetch regions (if any previous chunks)
 2. Compute attention against previous KV chunks (no causal mask)
 3. Compute attention against current chunk's KV (causal)
 4. Merge all results using online softmax
+The three-region design keeps the current chunk's KV in the compute region while
+previous KV is loaded from CPU into the prefetch region, so writes and loads never
+overlap in the same region.
 """
 from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
@@ -116,7 +122,7 @@ class Attention(nn.Module):
 o_acc = None
 lse_acc = None
-# Load previous KV from CPU using Ping-Pong
+# Load previous KV from CPU using the compute/prefetch regions
 # Note: context.offload_engine is actually HybridKVCacheManager
 kvcache_manager = context.offload_engine
 seq = context.chunked_seq if hasattr(context, 'chunked_seq') else None
@@ -127,41 +133,42 @@ class Attention(nn.Module):
 if cpu_block_table:
     offload_engine = kvcache_manager.offload_engine
-    ping_size = offload_engine.ping_size
-    num_chunks = (len(cpu_block_table) + ping_size - 1) // ping_size
-    current_buffer = "ping"
+    # Use the prefetch region to load previous KV (no conflict with the current compute region)
+    prefetch_size = offload_engine.num_prefetch_blocks
+    num_chunks = (len(cpu_block_table) + prefetch_size - 1) // prefetch_size
+    use_compute = True  # alternate between the compute and prefetch regions
-    # Prefetch first chunk to Ping buffer
-    first_chunk_end = min(ping_size, len(cpu_block_table))
+    # First load the previous KV into the prefetch region
+    # Only layer 0 triggers the load (loads ALL layers at once)
+    first_chunk_end = min(prefetch_size, len(cpu_block_table))
     first_chunk_ids = cpu_block_table[:first_chunk_end]
-    offload_engine.load_to_ping(first_chunk_ids)
+    if self.layer_id == 0:
+        offload_engine.load_to_prefetch(first_chunk_ids)
     for chunk_idx in range(num_chunks):
-        start = chunk_idx * ping_size
-        end = min(start + ping_size, len(cpu_block_table))
+        start = chunk_idx * prefetch_size
+        end = min(start + prefetch_size, len(cpu_block_table))
         num_blocks_in_chunk = end - start
-        # Prefetch next chunk to OTHER buffer
-        if chunk_idx + 1 < num_chunks:
+        # Prefetch next chunk to the other buffer (if it exists)
+        # Only layer 0 triggers the load
+        if chunk_idx + 1 < num_chunks and self.layer_id == 0:
             next_start = end
-            next_end = min(next_start + ping_size, len(cpu_block_table))
+            next_end = min(next_start + prefetch_size, len(cpu_block_table))
             next_chunk_ids = cpu_block_table[next_start:next_end]
-            if current_buffer == "ping":
-                offload_engine.load_to_pong(next_chunk_ids)
+            if use_compute:
+                # We are currently on the prefetch region; the next chunk would go to the compute region,
+                # but the compute region already holds the current chunk's KV and must not be overwritten.
+                # So use a simple synchronous strategy: wait for the current chunk before loading.
+                pass  # simplified version: no double buffering, only the prefetch region is used
             else:
-                offload_engine.load_to_ping(next_chunk_ids)
+                offload_engine.load_to_prefetch(next_chunk_ids)
-        # Wait for current buffer and get KV
-        if current_buffer == "ping":
-            offload_engine.wait_ping()
-            prev_k, prev_v = offload_engine.get_kv_for_ping_slots(
-                self.layer_id, num_blocks_in_chunk
-            )
-        else:
-            offload_engine.wait_pong()
-            prev_k, prev_v = offload_engine.get_kv_for_pong_slots(
-                self.layer_id, num_blocks_in_chunk
-            )
+        # Wait for the prefetch region and get KV
+        offload_engine.wait_prefetch()
+        prev_k, prev_v = offload_engine.get_kv_for_prefetch(
+            self.layer_id, num_blocks_in_chunk
+        )
         # Compute attention against this chunk (no causal mask)
         prev_o, prev_lse = flash_attn_with_lse(
@@ -178,8 +185,12 @@ class Attention(nn.Module):
 else:
     o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, prev_o, prev_lse)
-# Switch buffer
-current_buffer = "pong" if current_buffer == "ping" else "ping"
+# Load the next chunk into the prefetch region (if there is one)
+if chunk_idx + 1 < num_chunks and self.layer_id == 0:
+    next_start = end
+    next_end = min(next_start + prefetch_size, len(cpu_block_table))
+    next_chunk_ids = cpu_block_table[next_start:next_end]
+    offload_engine.load_to_prefetch(next_chunk_ids)
 # Compute attention against current chunk's KV (with causal mask)
 current_o, current_lse = flash_attn_with_lse(
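The per-chunk results above are combined with merge_attention_outputs, which folds partial attention outputs over disjoint key sets together via their log-sum-exp (LSE) values. A minimal sketch of that online-softmax merge, assuming the usual flash-attn layout of o as [batch, seqlen_q, heads, head_dim] and lse as [batch, heads, seqlen_q]; this is the standard formula, not the repo's own implementation:

```python
import torch

def merge_attention_outputs_sketch(o1, lse1, o2, lse2):
    """Merge two partial attention results computed over disjoint key chunks.

    o*:   [batch, seqlen_q, num_heads, head_dim]
    lse*: [batch, num_heads, seqlen_q]  (log-sum-exp of the softmax denominator)
    """
    lse = torch.logaddexp(lse1, lse2)                          # combined normalizer
    w1 = torch.exp(lse1 - lse).transpose(1, 2).unsqueeze(-1)   # [B, Sq, H, 1]
    w2 = torch.exp(lse2 - lse).transpose(1, 2).unsqueeze(-1)
    return o1 * w1 + o2 * w2, lse
```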
@@ -207,13 +218,16 @@ class Attention(nn.Module):
 context,
 ) -> torch.Tensor:
 """
-Compute decode attention with Ping-Pong dual buffer.
-All KV is stored on CPU. Uses Ping-Pong buffers on GPU:
-1. Load first chunk to Ping buffer
-2. While computing on current buffer, prefetch next chunk to other buffer
-3. Alternate between Ping and Pong buffers
-4. Merge attention outputs using online softmax (LSE)
+Compute decode attention with the three-region GPU buffer.
+All KV is stored on CPU. Uses the compute-region buffer on GPU:
+1. Load a chunk into the compute region
+2. Compute attention
+3. Repeat for all chunks
+4. Finally, attend to the decode region (slot 0), which holds the new token's KV
+5. Merge all attention outputs using online softmax (LSE)
+Key point: the new token's KV lives in the decode region (slot 0) and is never
+overwritten by compute-region loads.
 """
 from nanovllm.kvcache.chunked_attention import flash_attn_with_lse, merge_attention_outputs
@@ -227,51 +241,37 @@ class Attention(nn.Module):
 # Get all CPU blocks for this sequence
 cpu_block_table, _ = kvcache_manager.get_all_cpu_blocks(seq)
+if self.layer_id == 0:
+    logger.debug(f"Decode attention: cpu_block_table={cpu_block_table}, seq.block_table={list(seq.block_table)}")
 if not cpu_block_table:
     raise RuntimeError("Chunked decode attention failed: no CPU blocks available")
-# Get the actual offload_engine for Ping-Pong operations
+# Get the actual offload_engine for three-region operations
 offload_engine = kvcache_manager.offload_engine
-# Calculate chunk info
-ping_size = offload_engine.ping_size
-num_chunks = (len(cpu_block_table) + ping_size - 1) // ping_size
+# Calculate chunk info using the compute region
+compute_size = offload_engine.num_compute_blocks
+num_chunks = (len(cpu_block_table) + compute_size - 1) // compute_size
 o_acc = None
 lse_acc = None
-current_buffer = "ping"
-# Prefetch first chunk to Ping buffer (loads all layers at once)
-first_chunk_end = min(ping_size, len(cpu_block_table))
-first_chunk_ids = cpu_block_table[:first_chunk_end]
-offload_engine.load_to_ping(first_chunk_ids)
 for chunk_idx in range(num_chunks):
-    start = chunk_idx * ping_size
-    end = min(start + ping_size, len(cpu_block_table))
+    start = chunk_idx * compute_size
+    end = min(start + compute_size, len(cpu_block_table))
     num_blocks_in_chunk = end - start
+    chunk_ids = cpu_block_table[start:end]
-    # Prefetch next chunk to OTHER buffer (overlapped with current computation)
-    if chunk_idx + 1 < num_chunks:
-        next_start = end
-        next_end = min(next_start + ping_size, len(cpu_block_table))
-        next_chunk_ids = cpu_block_table[next_start:next_end]
-        if current_buffer == "ping":
-            offload_engine.load_to_pong(next_chunk_ids)
-        else:
-            offload_engine.load_to_ping(next_chunk_ids)
+    # Load this chunk into the compute region
+    # Only layer 0 triggers the load (loads ALL layers at once)
+    if self.layer_id == 0:
+        offload_engine.load_to_compute(chunk_ids)
-    # Wait for current buffer to be ready and get KV
-    if current_buffer == "ping":
-        offload_engine.wait_ping()
-        k_chunk, v_chunk = offload_engine.get_kv_for_ping_slots(
-            self.layer_id, num_blocks_in_chunk
-        )
-    else:
-        offload_engine.wait_pong()
-        k_chunk, v_chunk = offload_engine.get_kv_for_pong_slots(
-            self.layer_id, num_blocks_in_chunk
-        )
+    # Wait for the compute region to be ready and get KV
+    offload_engine.wait_compute()
+    k_chunk, v_chunk = offload_engine.get_kv_for_compute(
+        self.layer_id, num_blocks_in_chunk
+    )
     # Compute attention for this chunk
     o_chunk, lse_chunk = flash_attn_with_lse(
@@ -286,8 +286,21 @@ class Attention(nn.Module):
 else:
     o_acc, lse_acc = merge_attention_outputs(o_acc, lse_acc, o_chunk, lse_chunk)
-# Switch buffer for next iteration
-current_buffer = "pong" if current_buffer == "ping" else "ping"
+# Now attend to the decode region (holds the new token's KV)
+# This is the token being decoded - only 1 token at position pos_in_block
+pos_in_block = context.decode_pos_in_block
+decode_k, decode_v = offload_engine.get_kv_for_decode_slot(self.layer_id, pos_in_block)
+decode_o, decode_lse = flash_attn_with_lse(
+    q_batched, decode_k, decode_v,
+    softmax_scale=self.scale,
+    causal=False,
+)
+# Merge with accumulated
+if o_acc is None:
+    o_acc = decode_o
+else:
+    o_acc, _ = merge_attention_outputs(o_acc, lse_acc, decode_o, decode_lse)
 if o_acc is None:
     raise RuntimeError("Chunked decode attention failed: no KV available")
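End to end, the decode attention path now reads as: chunk the CPU-resident KV by compute-region capacity, attend chunk by chunk, then attend to the decode slot last. A condensed single-layer sketch of that flow, using the names introduced in this diff (`q_batched`, `flash_attn_with_lse`, and `merge_attention_outputs` are taken as given; the wiring is simplified):

```python
# Condensed sketch of the three-region decode attention loop (single layer view).
def decode_attention_sketch(q_batched, engine, cpu_block_table, layer_id,
                            pos_in_block, scale,
                            flash_attn_with_lse, merge_attention_outputs):
    compute_size = engine.num_compute_blocks
    num_chunks = (len(cpu_block_table) + compute_size - 1) // compute_size
    o_acc = lse_acc = None
    for chunk_idx in range(num_chunks):
        start = chunk_idx * compute_size
        end = min(start + compute_size, len(cpu_block_table))
        if layer_id == 0:                       # layer 0 loads all layers at once
            engine.load_to_compute(cpu_block_table[start:end])
        engine.wait_compute()
        k, v = engine.get_kv_for_compute(layer_id, end - start)
        o, lse = flash_attn_with_lse(q_batched, k, v, softmax_scale=scale, causal=False)
        o_acc, lse_acc = (o, lse) if o_acc is None else merge_attention_outputs(o_acc, lse_acc, o, lse)
    # Finally attend to the new token's KV sitting in the decode slot.
    dk, dv = engine.get_kv_for_decode_slot(layer_id, pos_in_block)
    do, dlse = flash_attn_with_lse(q_batched, dk, dv, softmax_scale=scale, causal=False)
    return do if o_acc is None else merge_attention_outputs(o_acc, lse_acc, do, dlse)[0]
```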

View File

@@ -27,6 +27,8 @@ class Context:
 prev_kv_chunks: List[Tuple[torch.Tensor, torch.Tensor]] = field(default_factory=list)
 # Current sequence being processed (for chunked prefill to load KV)
 chunked_seq: Any = None
+# Position within block for decode (used for reading from the decode region)
+decode_pos_in_block: int = 0
 _CONTEXT = Context()
@@ -50,6 +52,7 @@ def set_context(
 chunk_offset=0,
 offload_engine=None,
 chunked_seq=None,
+decode_pos_in_block=0,
 ):
 global _CONTEXT
 _CONTEXT = Context(
@@ -66,6 +69,7 @@ def set_context(
 chunk_offset=chunk_offset,
 offload_engine=offload_engine,
 chunked_seq=chunked_seq,
+decode_pos_in_block=decode_pos_in_block,
 )