[feat] Added chunked prefill and KV cache offload mechanism.
nanovllm/kvcache/base_manager.py (new file, 260 lines)
@@ -0,0 +1,260 @@
"""
Abstract base class for KV cache managers.

This interface allows pluggable implementations:
- GPUOnlyManager: Pure GPU (current nano-vllm behavior)
- HybridKVCacheManager: CPU offload with CUDA Graph support
- Future: Disk offload, distributed cache, etc.
"""

from abc import ABC, abstractmethod
from typing import List, Tuple, Optional

import torch
from torch import Tensor

from nanovllm.engine.sequence import Sequence


class KVCacheManager(ABC):
    """
    Abstract base class for KV cache management strategies.

    A KVCacheManager handles:
    1. Physical memory allocation (GPU and optionally CPU)
    2. Logical block management (allocation, deallocation, prefix caching)
    3. Data transfer between devices (for hybrid managers)
    4. Integration with CUDA graphs

    Key design principles:
    - Sequences reference logical block IDs
    - Physical block IDs (GPU slots) may differ from logical IDs
    - CUDA Graph compatibility requires fixed tensor addresses
    """

    @property
    @abstractmethod
    def block_size(self) -> int:
        """Number of tokens per block."""
        pass

    @property
    @abstractmethod
    def num_free_blocks(self) -> int:
        """Number of free logical blocks available for allocation."""
        pass

    @abstractmethod
    def allocate_cache(
        self,
        num_layers: int,
        num_kv_heads: int,
        head_dim: int,
        dtype: torch.dtype,
    ) -> None:
        """
        Allocate KV cache storage.

        Called once during initialization to allocate GPU (and optionally CPU)
        memory for the KV cache.

        Args:
            num_layers: Number of transformer layers
            num_kv_heads: Number of key-value heads per layer
            head_dim: Dimension per head
            dtype: Data type for cache (e.g., torch.float16)
        """
        pass

    @abstractmethod
    def get_layer_cache(self, layer_id: int) -> Tuple[Tensor, Tensor]:
        """
        Get K and V cache tensors for a specific layer.

        The returned tensors must be on GPU and have fixed addresses
        for CUDA Graph compatibility.

        Args:
            layer_id: Layer index

        Returns:
            (k_cache, v_cache) tensors.
            Shape depends on implementation, typically:
            [num_blocks, block_size, kv_heads, head_dim]
        """
        pass

    @abstractmethod
    def can_allocate(self, seq: Sequence) -> bool:
        """
        Check if blocks can be allocated for a new sequence.

        Called before allocate() to ensure sufficient resources.

        Args:
            seq: Sequence to check

        Returns:
            True if allocation is possible
        """
        pass

    @abstractmethod
    def allocate(self, seq: Sequence) -> None:
        """
        Allocate blocks for a sequence during prefill.

        This method:
        1. Checks the prefix cache for matching blocks
        2. Allocates new blocks as needed
        3. Updates seq.block_table with logical block IDs
        4. Updates seq.num_cached_tokens for prefix cache hits

        Args:
            seq: Sequence to allocate blocks for
        """
        pass

    @abstractmethod
    def deallocate(self, seq: Sequence) -> None:
        """
        Release blocks for a finished sequence.

        This method:
        1. Decrements reference counts
        2. Frees blocks with zero references
        3. Clears seq.block_table

        Args:
            seq: Sequence whose blocks to release
        """
        pass

    @abstractmethod
    def can_append(self, seq: Sequence) -> bool:
        """
        Check if a new block can be allocated for decode.

        Called before may_append() to check if resources are available.

        Args:
            seq: Sequence to check

        Returns:
            True if append is possible (or no new block is needed)
        """
        pass

    @abstractmethod
    def may_append(self, seq: Sequence) -> None:
        """
        Potentially allocate a new block during decode.

        Called after each decode step. If the current block is full,
        allocates a new block and updates seq.block_table.

        Args:
            seq: Sequence that may need a new block
        """
        pass

    @abstractmethod
    def prepare_for_attention(
        self,
        seqs: List[Sequence],
        is_prefill: bool,
    ) -> None:
        """
        Prepare the KV cache for attention computation.

        For GPU-only managers this is typically a no-op. Hybrid managers
        ensure all needed blocks are on GPU and may trigger prefetching
        from CPU.

        Called before attention computation. For decode with CUDA graphs,
        this should update gather_indices but not perform actual transfers
        (transfers happen inside the graph).

        Args:
            seqs: Sequences that will be processed
            is_prefill: True for prefill phase, False for decode
        """
        pass

    @abstractmethod
    def get_gpu_block_tables(
        self,
        seqs: List[Sequence],
    ) -> List[List[int]]:
        """
        Get GPU physical block tables for sequences.

        For GPU-only managers: returns seq.block_table directly.
        For hybrid managers: returns GPU slot IDs (may differ from logical IDs).

        The returned block tables are used to compute slot_mapping
        in ModelRunner.prepare_prefill/decode.

        Args:
            seqs: Sequences to get block tables for

        Returns:
            List of GPU block tables, one per sequence
        """
        pass

    def post_attention_cleanup(
        self,
        seqs: List[Sequence],
        is_prefill: bool,
    ) -> None:
        """
        Clean up after attention computation.

        Optional hook for managers to perform post-attention tasks:
        - Offloading cold blocks to CPU
        - Updating access statistics

        The default implementation does nothing.

        Args:
            seqs: Sequences that were processed
            is_prefill: True for prefill phase, False for decode
        """
        pass

    def get_num_blocks_needed(self, num_tokens: int) -> int:
        """
        Calculate the number of blocks needed for a given token count.

        Args:
            num_tokens: Number of tokens

        Returns:
            Number of blocks needed
        """
        return (num_tokens + self.block_size - 1) // self.block_size

    @staticmethod
    def compute_hash(token_ids: list, prefix: int = -1) -> int:
        """
        Compute hash for prefix caching.

        Uses xxhash for fast hashing. The hash includes the prefix hash
        to create a chain of hashes for multi-block sequences.

        Args:
            token_ids: Token IDs in the block
            prefix: Hash of previous block, or -1 for first block

        Returns:
            Hash value
        """
        import xxhash
        import numpy as np

        h = xxhash.xxh64()
        if prefix != -1:
            h.update(prefix.to_bytes(8, "little"))
        h.update(np.array(token_ids).tobytes())
        return h.intdigest()
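

# ---------------------------------------------------------------------------
# Illustrative sketches (not referenced by the engine): a minimal outline of
# the call order a scheduler / model runner might follow around this
# interface, and of how compute_hash chains block hashes for prefix caching.
# The helper names below (_example_prefill_step, _example_decode_step,
# _example_block_hashes) are hypothetical and exist only for documentation;
# the attention kernels themselves are out of scope here.


def _example_prefill_step(manager: KVCacheManager, seqs: List[Sequence]) -> List[List[int]]:
    """Hedged sketch: admit sequences, allocate blocks, and fetch GPU block tables."""
    admitted: List[Sequence] = []
    for seq in seqs:
        if manager.can_allocate(seq):      # check before allocating, as documented
            manager.allocate(seq)           # fills seq.block_table / num_cached_tokens
            admitted.append(seq)
    manager.prepare_for_attention(admitted, is_prefill=True)
    block_tables = manager.get_gpu_block_tables(admitted)
    # ... compute slot_mapping from block_tables and run prefill attention ...
    manager.post_attention_cleanup(admitted, is_prefill=True)
    return block_tables


def _example_decode_step(manager: KVCacheManager, seqs: List[Sequence]) -> None:
    """Hedged sketch: per-step decode bookkeeping around the attention call."""
    manager.prepare_for_attention(seqs, is_prefill=False)   # e.g. update gather_indices
    block_tables = manager.get_gpu_block_tables(seqs)       # feeds slot_mapping for the step
    # ... run decode attention with block_tables (possibly inside a CUDA graph) ...
    manager.post_attention_cleanup(seqs, is_prefill=False)
    for seq in seqs:
        if manager.can_append(seq):   # a real scheduler would handle the False case,
            manager.may_append(seq)   # e.g. by preempting, rather than silently skipping


def _example_block_hashes(token_ids: List[int], block_size: int) -> List[int]:
    """Hedged sketch: chain compute_hash over full blocks, as prefix caching expects."""
    hashes: List[int] = []
    prefix = -1
    for start in range(0, len(token_ids) - block_size + 1, block_size):
        block = token_ids[start:start + block_size]
        prefix = KVCacheManager.compute_hash(block, prefix)  # each hash covers all prior blocks
        hashes.append(prefix)
    return hashes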