[feat] Added chunked prefill and kvcache offload mechanism.

Zijie Tian
2025-12-10 03:47:37 +08:00
parent 204fe2b38f
commit 0b6f19242d
25 changed files with 4414 additions and 61 deletions


@@ -0,0 +1,51 @@
"""
Eviction policy plugins for KV cache offloading.
Users can create custom policies by subclassing EvictionPolicy
and specifying the full class path in config.offload_policy.
"""
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
from nanovllm.kvcache.policies.lru_policy import LRUPolicy
from nanovllm.kvcache.policies.fifo_policy import FIFOPolicy
# Built-in policy registry
BUILTIN_POLICIES = {
"lru": LRUPolicy,
"fifo": FIFOPolicy,
}
def get_policy(policy_name: str) -> EvictionPolicy:
"""
Get an eviction policy instance by name or class path.
Args:
policy_name: Either a built-in name ("lru", "fifo") or
a full class path ("mymodule.MyPolicy")
Returns:
EvictionPolicy instance
"""
# Check built-in policies first
if policy_name.lower() in BUILTIN_POLICIES:
return BUILTIN_POLICIES[policy_name.lower()]()
# Try to import custom policy
try:
module_path, class_name = policy_name.rsplit(".", 1)
import importlib
module = importlib.import_module(module_path)
policy_class = getattr(module, class_name)
if not issubclass(policy_class, EvictionPolicy):
raise TypeError(f"{policy_name} is not a subclass of EvictionPolicy")
return policy_class()
except (ValueError, ImportError, AttributeError) as e:
raise ValueError(
f"Unknown policy '{policy_name}'. "
f"Available built-in policies: {list(BUILTIN_POLICIES.keys())}. "
f"For custom policies, use full class path: 'mymodule.MyPolicy'"
) from e
__all__ = ["EvictionPolicy", "LRUPolicy", "FIFOPolicy", "get_policy", "BUILTIN_POLICIES"]
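A minimal usage sketch for `get_policy`, assuming the module above is the policies package `__init__` (the diff does not show file names), so it is importable as `nanovllm.kvcache.policies`; the dotted path `mypolicies.EvictNewestPolicy` mentioned in the comments is a hypothetical custom class:

```python
# Usage sketch for get_policy; import path is an assumption based on the
# package-style imports above (nanovllm/kvcache/policies/__init__.py).
from nanovllm.kvcache.policies import get_policy

lru = get_policy("lru")     # built-in name
fifo = get_policy("FIFO")   # case-insensitive: resolved via .lower()

# A custom policy is referenced by its full dotted class path, e.g.
# get_policy("mypolicies.EvictNewestPolicy"); the class must subclass
# EvictionPolicy, otherwise a TypeError propagates out of get_policy.

# Unknown names are reported together with the built-in policy names.
try:
    get_policy("no-such-policy")
except ValueError as err:
    print(err)
```

Because the lookup lowercases the name before checking `BUILTIN_POLICIES`, "LRU" and "lru" resolve to the same class.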


@@ -0,0 +1,156 @@
"""
Base class for eviction policies.
Users can implement custom policies by subclassing EvictionPolicy
and overriding the abstract methods.
"""
from abc import ABC, abstractmethod
from typing import Set, Optional
class EvictionPolicy(ABC):
"""
Abstract base class for KV cache eviction policies.
An eviction policy determines which GPU blocks to evict to CPU
when GPU memory is full and new blocks need to be allocated.
Lifecycle:
1. on_block_allocated() - called when a new block is allocated
2. on_block_access() - called each time a block is accessed (e.g., in attention)
3. select_victim() - called when a block needs to be evicted
4. on_block_evicted() - called after a block is evicted
Example custom policy:
```python
class MyCustomPolicy(EvictionPolicy):
def __init__(self):
self.priorities = {}
def on_block_allocated(self, block_id: int, step: int):
self.priorities[block_id] = step
def on_block_access(self, block_id: int, step: int):
# Custom access tracking
pass
def select_victim(self, candidates: Set[int]) -> int:
# Return block with lowest priority
return min(candidates, key=lambda b: self.priorities.get(b, 0))
def on_block_evicted(self, block_id: int):
self.priorities.pop(block_id, None)
```
"""
@abstractmethod
def on_block_allocated(self, block_id: int, step: int) -> None:
"""
Called when a new block is allocated on GPU.
Args:
block_id: The GPU block ID that was allocated
step: Current inference step (monotonically increasing)
"""
pass
@abstractmethod
def on_block_access(self, block_id: int, step: int) -> None:
"""
Called when a block is accessed during attention computation.
Args:
block_id: The GPU block ID being accessed
step: Current inference step
"""
pass
@abstractmethod
def select_victim(self, candidates: Set[int]) -> int:
"""
Select a block to evict from the candidate set.
This is called when GPU memory is full and a new block
needs to be allocated. The returned block will be evicted
to CPU.
Args:
candidates: Set of GPU block IDs that can be evicted
(blocks not currently being used)
Returns:
Block ID to evict
Raises:
ValueError: If candidates is empty
"""
pass
@abstractmethod
def on_block_evicted(self, block_id: int) -> None:
"""
Called after a block is evicted from GPU to CPU.
Args:
block_id: The GPU block ID that was evicted
"""
pass
def on_block_prefetched(self, block_id: int, step: int) -> None:
"""
Called when a block is prefetched from CPU back to GPU.
Default implementation calls on_block_allocated().
Override for custom behavior.
Args:
block_id: The GPU block ID that was prefetched to
step: Current inference step
"""
self.on_block_allocated(block_id, step)
def on_block_deallocated(self, block_id: int) -> None:
"""
Called when a block is fully deallocated (sequence finished).
Default implementation calls on_block_evicted().
Override for custom behavior.
Args:
block_id: The GPU block ID being deallocated
"""
self.on_block_evicted(block_id)
def reset(self) -> None:
"""
Reset policy state.
Called when the inference engine is reset.
Default implementation does nothing.
"""
pass
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Get multiple blocks to evict in order of priority.
Default implementation calls select_victim() repeatedly.
Override for more efficient batch selection.
Args:
candidates: Set of candidate block IDs
count: Number of blocks to evict
Returns:
List of block IDs to evict, in order
"""
result = []
remaining = set(candidates)
for _ in range(min(count, len(remaining))):
if not remaining:
break
victim = self.select_victim(remaining)
result.append(victim)
remaining.remove(victim)
return result
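To make the documented lifecycle concrete, here is an illustrative toy driver, not part of this commit, that calls the hooks in the documented order (allocate, access, select_victim, evicted); the two-block `CAPACITY`, the `allocate` helper, and the block IDs are invented for the sketch:

```python
# Toy driver for the EvictionPolicy lifecycle. The "manager" loop below is
# illustrative only; the hook order comes from the class docstring:
# allocate -> access -> select_victim -> evicted.
from nanovllm.kvcache.policies import LRUPolicy

policy = LRUPolicy()
gpu_blocks: set[int] = set()
CAPACITY = 2   # invented for the sketch
step = 0

def allocate(block_id: int) -> None:
    """Allocate a GPU block, evicting one first if the toy cache is full."""
    global step
    step += 1
    if len(gpu_blocks) >= CAPACITY:
        # In this sketch every resident block is evictable; a real manager
        # would exclude blocks pinned by in-flight sequences.
        victim = policy.select_victim(gpu_blocks)
        gpu_blocks.discard(victim)
        policy.on_block_evicted(victim)   # would trigger the GPU->CPU copy
    gpu_blocks.add(block_id)
    policy.on_block_allocated(block_id, step)

allocate(0)
allocate(1)
policy.on_block_access(0, step)   # touching block 0 makes block 1 the LRU
allocate(2)                       # cache is full, so block 1 is evicted
print(sorted(gpu_blocks))         # [0, 2]
```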


@@ -0,0 +1,101 @@
"""
FIFO (First In, First Out) eviction policy.
Evicts the block that was allocated earliest.
Simple policy that ignores access patterns.
"""
from collections import OrderedDict
from typing import Set
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
class FIFOPolicy(EvictionPolicy):
"""
First In, First Out (FIFO) eviction policy.
Evicts blocks in the order they were allocated,
regardless of access patterns.
Properties:
- O(1) operations for all methods
- Simple and predictable behavior
- Good for streaming workloads where older data
is naturally less relevant
- Does not adapt to access patterns (unlike LRU)
"""
def __init__(self):
# OrderedDict maintains insertion order
# Key: block_id, Value: allocation_step
# Oldest (first allocated) is at the front
self.allocation_order: OrderedDict[int, int] = OrderedDict()
def on_block_allocated(self, block_id: int, step: int) -> None:
"""Record allocation order (does not change on access)."""
if block_id not in self.allocation_order:
self.allocation_order[block_id] = step
def on_block_access(self, block_id: int, step: int) -> None:
"""
FIFO ignores access patterns.
This is the key difference from LRU - we don't
update the position based on access.
"""
pass # Intentionally empty
def select_victim(self, candidates: Set[int]) -> int:
"""
Select the earliest allocated block from candidates.
"""
if not candidates:
raise ValueError("Cannot select victim from empty candidate set")
# Iterate from oldest (front) to newest (back)
for block_id in self.allocation_order:
if block_id in candidates:
return block_id
# Fallback: return any candidate
return next(iter(candidates))
def on_block_evicted(self, block_id: int) -> None:
"""Remove block from tracking."""
self.allocation_order.pop(block_id, None)
def on_block_prefetched(self, block_id: int, step: int) -> None:
"""
When prefetched, treat as new allocation.
This moves the block to the end of the queue,
giving it more time before eviction.
"""
# Remove old entry if exists
self.allocation_order.pop(block_id, None)
# Add as new allocation
self.allocation_order[block_id] = step
def on_block_deallocated(self, block_id: int) -> None:
"""Remove block from tracking."""
self.allocation_order.pop(block_id, None)
def reset(self) -> None:
"""Clear all tracking data."""
self.allocation_order.clear()
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Get multiple blocks to evict in FIFO order.
"""
result = []
for block_id in self.allocation_order:
if block_id in candidates:
result.append(block_id)
if len(result) >= count:
break
return result
def __repr__(self) -> str:
return f"FIFOPolicy(tracked_blocks={len(self.allocation_order)})"


@@ -0,0 +1,93 @@
"""
LRU (Least Recently Used) eviction policy.
Evicts the block that was accessed least recently.
This is the default and recommended policy for most use cases.
"""
from collections import OrderedDict
from typing import Set
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
class LRUPolicy(EvictionPolicy):
"""
Least Recently Used (LRU) eviction policy.
Maintains an ordered dictionary of block access times.
When eviction is needed, selects the block that was
accessed least recently.
Properties:
- O(1) access tracking
- O(n) victim selection in worst case, but typically fast
due to OrderedDict iteration order
- Good for workloads with temporal locality
"""
def __init__(self):
# OrderedDict maintains insertion/update order
# Key: block_id, Value: last_access_step
# Oldest (least recently used) is at the front
self.access_order: OrderedDict[int, int] = OrderedDict()
def on_block_allocated(self, block_id: int, step: int) -> None:
"""Record allocation as an access."""
# Move to end (most recently used)
self.access_order[block_id] = step
self.access_order.move_to_end(block_id)
def on_block_access(self, block_id: int, step: int) -> None:
"""Update access time and move to end."""
if block_id in self.access_order:
self.access_order[block_id] = step
self.access_order.move_to_end(block_id)
def select_victim(self, candidates: Set[int]) -> int:
"""
Select the least recently used block from candidates.
Iterates from oldest to newest in access order,
returns the first one that's in the candidate set.
"""
if not candidates:
raise ValueError("Cannot select victim from empty candidate set")
# Iterate from oldest (front) to newest (back)
for block_id in self.access_order:
if block_id in candidates:
return block_id
# Fallback: return any candidate (shouldn't happen normally)
return next(iter(candidates))
def on_block_evicted(self, block_id: int) -> None:
"""Remove block from tracking."""
self.access_order.pop(block_id, None)
def on_block_deallocated(self, block_id: int) -> None:
"""Remove block from tracking."""
self.access_order.pop(block_id, None)
def reset(self) -> None:
"""Clear all tracking data."""
self.access_order.clear()
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Efficiently get multiple blocks to evict in LRU order.
Optimized for batch eviction - iterates through access_order
once instead of calling select_victim() multiple times.
"""
result = []
for block_id in self.access_order:
if block_id in candidates:
result.append(block_id)
if len(result) >= count:
break
return result
def __repr__(self) -> str:
return f"LRUPolicy(tracked_blocks={len(self.access_order)})"