[feat] Added chunked prefill and kvcache offload mechanism.

Zijie Tian
2025-12-10 03:47:37 +08:00
parent 204fe2b38f
commit 0b6f19242d
25 changed files with 4414 additions and 61 deletions


@@ -0,0 +1,51 @@
"""
Eviction policy plugins for KV cache offloading.
Users can create custom policies by subclassing EvictionPolicy
and specifying the full class path in config.offload_policy.
"""
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
from nanovllm.kvcache.policies.lru_policy import LRUPolicy
from nanovllm.kvcache.policies.fifo_policy import FIFOPolicy
# Built-in policy registry
BUILTIN_POLICIES = {
"lru": LRUPolicy,
"fifo": FIFOPolicy,
}
def get_policy(policy_name: str) -> EvictionPolicy:
"""
Get an eviction policy instance by name or class path.
Args:
policy_name: Either a built-in name ("lru", "fifo") or
a full class path ("mymodule.MyPolicy")
Returns:
EvictionPolicy instance
"""
# Check built-in policies first
if policy_name.lower() in BUILTIN_POLICIES:
return BUILTIN_POLICIES[policy_name.lower()]()
# Try to import custom policy
try:
module_path, class_name = policy_name.rsplit(".", 1)
import importlib
module = importlib.import_module(module_path)
policy_class = getattr(module, class_name)
if not issubclass(policy_class, EvictionPolicy):
raise TypeError(f"{policy_name} is not a subclass of EvictionPolicy")
return policy_class()
except (ValueError, ImportError, AttributeError) as e:
raise ValueError(
f"Unknown policy '{policy_name}'. "
f"Available built-in policies: {list(BUILTIN_POLICIES.keys())}. "
f"For custom policies, use full class path: 'mymodule.MyPolicy'"
) from e
__all__ = ["EvictionPolicy", "LRUPolicy", "FIFOPolicy", "get_policy", "BUILTIN_POLICIES"]
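A minimal usage sketch for `get_policy`, assuming the module above is the policies package `__init__` (the diff does not show file names), so it is importable as `nanovllm.kvcache.policies`; the dotted path `mypolicies.EvictNewestPolicy` mentioned in the comments is a hypothetical custom class:

```python
# Usage sketch for get_policy; import path is an assumption based on the
# package-style imports above (nanovllm/kvcache/policies/__init__.py).
from nanovllm.kvcache.policies import get_policy

lru = get_policy("lru")     # built-in name
fifo = get_policy("FIFO")   # case-insensitive: resolved via .lower()

# A custom policy is referenced by its full dotted class path, e.g.
# get_policy("mypolicies.EvictNewestPolicy"); the class must subclass
# EvictionPolicy, otherwise a TypeError propagates out of get_policy.

# Unknown names are reported together with the built-in policy names.
try:
    get_policy("no-such-policy")
except ValueError as err:
    print(err)
```

Because the lookup lowercases the name before checking `BUILTIN_POLICIES`, "LRU" and "lru" resolve to the same class.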


@@ -0,0 +1,156 @@
"""
Base class for eviction policies.
Users can implement custom policies by subclassing EvictionPolicy
and overriding the abstract methods.
"""
from abc import ABC, abstractmethod
from typing import Set, Optional
class EvictionPolicy(ABC):
"""
Abstract base class for KV cache eviction policies.
An eviction policy determines which GPU blocks to evict to CPU
when GPU memory is full and new blocks need to be allocated.
Lifecycle:
1. on_block_allocated() - called when a new block is allocated
2. on_block_access() - called each time a block is accessed (e.g., in attention)
3. select_victim() - called when a block needs to be evicted
4. on_block_evicted() - called after a block is evicted
Example custom policy:
```python
class MyCustomPolicy(EvictionPolicy):
def __init__(self):
self.priorities = {}
def on_block_allocated(self, block_id: int, step: int):
self.priorities[block_id] = step
def on_block_access(self, block_id: int, step: int):
# Custom access tracking
pass
def select_victim(self, candidates: Set[int]) -> int:
# Return block with lowest priority
return min(candidates, key=lambda b: self.priorities.get(b, 0))
def on_block_evicted(self, block_id: int):
self.priorities.pop(block_id, None)
```
"""
@abstractmethod
def on_block_allocated(self, block_id: int, step: int) -> None:
"""
Called when a new block is allocated on GPU.
Args:
block_id: The GPU block ID that was allocated
step: Current inference step (monotonically increasing)
"""
pass
@abstractmethod
def on_block_access(self, block_id: int, step: int) -> None:
"""
Called when a block is accessed during attention computation.
Args:
block_id: The GPU block ID being accessed
step: Current inference step
"""
pass
@abstractmethod
def select_victim(self, candidates: Set[int]) -> int:
"""
Select a block to evict from the candidate set.
This is called when GPU memory is full and a new block
needs to be allocated. The returned block will be evicted
to CPU.
Args:
candidates: Set of GPU block IDs that can be evicted
(blocks not currently being used)
Returns:
Block ID to evict
Raises:
ValueError: If candidates is empty
"""
pass
@abstractmethod
def on_block_evicted(self, block_id: int) -> None:
"""
Called after a block is evicted from GPU to CPU.
Args:
block_id: The GPU block ID that was evicted
"""
pass
def on_block_prefetched(self, block_id: int, step: int) -> None:
"""
Called when a block is prefetched from CPU back to GPU.
Default implementation calls on_block_allocated().
Override for custom behavior.
Args:
block_id: The GPU block ID that was prefetched to
step: Current inference step
"""
self.on_block_allocated(block_id, step)
def on_block_deallocated(self, block_id: int) -> None:
"""
Called when a block is fully deallocated (sequence finished).
Default implementation calls on_block_evicted().
Override for custom behavior.
Args:
block_id: The GPU block ID being deallocated
"""
self.on_block_evicted(block_id)
def reset(self) -> None:
"""
Reset policy state.
Called when the inference engine is reset.
Default implementation does nothing.
"""
pass
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Get multiple blocks to evict in order of priority.
Default implementation calls select_victim() repeatedly.
Override for more efficient batch selection.
Args:
candidates: Set of candidate block IDs
count: Number of blocks to evict
Returns:
List of block IDs to evict, in order
"""
result = []
remaining = set(candidates)
for _ in range(min(count, len(remaining))):
if not remaining:
break
victim = self.select_victim(remaining)
result.append(victim)
remaining.remove(victim)
return result
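To make the documented lifecycle concrete, here is an illustrative toy driver, not part of this commit, that calls the hooks in the documented order (allocate, access, select_victim, evicted); the two-block `CAPACITY`, the `allocate` helper, and the block IDs are invented for the sketch:

```python
# Toy driver for the EvictionPolicy lifecycle. The "manager" loop below is
# illustrative only; the hook order comes from the class docstring:
# allocate -> access -> select_victim -> evicted.
from nanovllm.kvcache.policies import LRUPolicy

policy = LRUPolicy()
gpu_blocks: set[int] = set()
CAPACITY = 2   # invented for the sketch
step = 0

def allocate(block_id: int) -> None:
    """Allocate a GPU block, evicting one first if the toy cache is full."""
    global step
    step += 1
    if len(gpu_blocks) >= CAPACITY:
        # In this sketch every resident block is evictable; a real manager
        # would exclude blocks pinned by in-flight sequences.
        victim = policy.select_victim(gpu_blocks)
        gpu_blocks.discard(victim)
        policy.on_block_evicted(victim)   # would trigger the GPU->CPU copy
    gpu_blocks.add(block_id)
    policy.on_block_allocated(block_id, step)

allocate(0)
allocate(1)
policy.on_block_access(0, step)   # touching block 0 makes block 1 the LRU
allocate(2)                       # cache is full, so block 1 is evicted
print(sorted(gpu_blocks))         # [0, 2]
```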


@@ -0,0 +1,101 @@
"""
FIFO (First In, First Out) eviction policy.
Evicts the block that was allocated earliest.
Simple policy that ignores access patterns.
"""
from collections import OrderedDict
from typing import Set
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
class FIFOPolicy(EvictionPolicy):
"""
First In, First Out (FIFO) eviction policy.
Evicts blocks in the order they were allocated,
regardless of access patterns.
Properties:
- O(1) operations for all methods
- Simple and predictable behavior
- Good for streaming workloads where older data
is naturally less relevant
- Does not adapt to access patterns (unlike LRU)
"""
def __init__(self):
# OrderedDict maintains insertion order
# Key: block_id, Value: allocation_step
# Oldest (first allocated) is at the front
self.allocation_order: OrderedDict[int, int] = OrderedDict()
def on_block_allocated(self, block_id: int, step: int) -> None:
"""Record allocation order (does not change on access)."""
if block_id not in self.allocation_order:
self.allocation_order[block_id] = step
def on_block_access(self, block_id: int, step: int) -> None:
"""
FIFO ignores access patterns.
This is the key difference from LRU - we don't
update the position based on access.
"""
pass # Intentionally empty
def select_victim(self, candidates: Set[int]) -> int:
"""
Select the earliest allocated block from candidates.
"""
if not candidates:
raise ValueError("Cannot select victim from empty candidate set")
# Iterate from oldest (front) to newest (back)
for block_id in self.allocation_order:
if block_id in candidates:
return block_id
# Fallback: return any candidate
return next(iter(candidates))
def on_block_evicted(self, block_id: int) -> None:
"""Remove block from tracking."""
self.allocation_order.pop(block_id, None)
def on_block_prefetched(self, block_id: int, step: int) -> None:
"""
When prefetched, treat as new allocation.
This moves the block to the end of the queue,
giving it more time before eviction.
"""
# Remove old entry if exists
self.allocation_order.pop(block_id, None)
# Add as new allocation
self.allocation_order[block_id] = step
def on_block_deallocated(self, block_id: int) -> None:
"""Remove block from tracking."""
self.allocation_order.pop(block_id, None)
def reset(self) -> None:
"""Clear all tracking data."""
self.allocation_order.clear()
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Get multiple blocks to evict in FIFO order.
"""
result = []
for block_id in self.allocation_order:
if block_id in candidates:
result.append(block_id)
if len(result) >= count:
break
return result
def __repr__(self) -> str:
return f"FIFOPolicy(tracked_blocks={len(self.allocation_order)})"


@@ -0,0 +1,93 @@
"""
LRU (Least Recently Used) eviction policy.
Evicts the block that was accessed least recently.
This is the default and recommended policy for most use cases.
"""
from collections import OrderedDict
from typing import Set
from nanovllm.kvcache.policies.base_policy import EvictionPolicy
class LRUPolicy(EvictionPolicy):
"""
Least Recently Used (LRU) eviction policy.
Maintains an ordered dictionary of block access times.
When eviction is needed, selects the block that was
accessed least recently.
Properties:
- O(1) access tracking
- O(n) victim selection in worst case, but typically fast
due to OrderedDict iteration order
- Good for workloads with temporal locality
"""
def __init__(self):
# OrderedDict maintains insertion/update order
# Key: block_id, Value: last_access_step
# Oldest (least recently used) is at the front
self.access_order: OrderedDict[int, int] = OrderedDict()
def on_block_allocated(self, block_id: int, step: int) -> None:
"""Record allocation as an access."""
# Move to end (most recently used)
self.access_order[block_id] = step
self.access_order.move_to_end(block_id)
def on_block_access(self, block_id: int, step: int) -> None:
"""Update access time and move to end."""
if block_id in self.access_order:
self.access_order[block_id] = step
self.access_order.move_to_end(block_id)
def select_victim(self, candidates: Set[int]) -> int:
"""
Select the least recently used block from candidates.
Iterates from oldest to newest in access order,
returns the first one that's in the candidate set.
"""
if not candidates:
raise ValueError("Cannot select victim from empty candidate set")
# Iterate from oldest (front) to newest (back)
for block_id in self.access_order:
if block_id in candidates:
return block_id
# Fallback: return any candidate (shouldn't happen normally)
return next(iter(candidates))
def on_block_evicted(self, block_id: int) -> None:
"""Remove block from tracking."""
self.access_order.pop(block_id, None)
def on_block_deallocated(self, block_id: int) -> None:
"""Remove block from tracking."""
self.access_order.pop(block_id, None)
def reset(self) -> None:
"""Clear all tracking data."""
self.access_order.clear()
def get_eviction_order(self, candidates: Set[int], count: int) -> list:
"""
Efficiently get multiple blocks to evict in LRU order.
Optimized for batch eviction - iterates through access_order
once instead of calling select_victim() multiple times.
"""
result = []
for block_id in self.access_order:
if block_id in candidates:
result.append(block_id)
if len(result) >= count:
break
return result
def __repr__(self) -> str:
return f"LRUPolicy(tracked_blocks={len(self.access_order)})"