[feat] Add chunked prefill and KV cache offload mechanism.
51  nanovllm/kvcache/policies/__init__.py  Normal file
@@ -0,0 +1,51 @@
"""
Eviction policy plugins for KV cache offloading.

Users can create custom policies by subclassing EvictionPolicy
and specifying the full class path in config.offload_policy.
"""

from nanovllm.kvcache.policies.base_policy import EvictionPolicy
from nanovllm.kvcache.policies.lru_policy import LRUPolicy
from nanovllm.kvcache.policies.fifo_policy import FIFOPolicy

# Built-in policy registry
BUILTIN_POLICIES = {
    "lru": LRUPolicy,
    "fifo": FIFOPolicy,
}


def get_policy(policy_name: str) -> EvictionPolicy:
    """
    Get an eviction policy instance by name or class path.

    Args:
        policy_name: Either a built-in name ("lru", "fifo") or
            a full class path ("mymodule.MyPolicy")

    Returns:
        EvictionPolicy instance
    """
    # Check built-in policies first
    if policy_name.lower() in BUILTIN_POLICIES:
        return BUILTIN_POLICIES[policy_name.lower()]()

    # Try to import custom policy
    try:
        module_path, class_name = policy_name.rsplit(".", 1)
        import importlib
        module = importlib.import_module(module_path)
        policy_class = getattr(module, class_name)
        if not issubclass(policy_class, EvictionPolicy):
            raise TypeError(f"{policy_name} is not a subclass of EvictionPolicy")
        return policy_class()
    except (ValueError, ImportError, AttributeError) as e:
        raise ValueError(
            f"Unknown policy '{policy_name}'. "
            f"Available built-in policies: {list(BUILTIN_POLICIES.keys())}. "
            f"For custom policies, use full class path: 'mymodule.MyPolicy'"
        ) from e


__all__ = ["EvictionPolicy", "LRUPolicy", "FIFOPolicy", "get_policy", "BUILTIN_POLICIES"]
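Not part of the commit: a quick usage sketch for get_policy(), exercising built-in lookup and the error path. The custom class path mentioned in the comment ("mypackage.policies.MyPolicy") is a hypothetical placeholder, not a real module.

```python
# Usage sketch (not in the commit): resolving policies by name or class path.
from nanovllm.kvcache.policies import get_policy

lru = get_policy("lru")      # built-in, case-insensitive lookup
fifo = get_policy("FIFO")    # also resolves via BUILTIN_POLICIES

# Custom policies are referenced by full class path; "mypackage.policies.MyPolicy"
# is hypothetical here and would have to subclass EvictionPolicy.
# custom = get_policy("mypackage.policies.MyPolicy")

# Unknown names raise ValueError listing the built-ins.
try:
    get_policy("does-not-exist")
except ValueError as e:
    print(e)
```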
156  nanovllm/kvcache/policies/base_policy.py  Normal file
@@ -0,0 +1,156 @@
"""
Base class for eviction policies.

Users can implement custom policies by subclassing EvictionPolicy
and overriding the abstract methods.
"""

from abc import ABC, abstractmethod
from typing import Set, Optional


class EvictionPolicy(ABC):
    """
    Abstract base class for KV cache eviction policies.

    An eviction policy determines which GPU blocks to evict to CPU
    when GPU memory is full and new blocks need to be allocated.

    Lifecycle:
    1. on_block_allocated() - called when a new block is allocated
    2. on_block_access() - called each time a block is accessed (e.g., in attention)
    3. select_victim() - called when a block needs to be evicted
    4. on_block_evicted() - called after a block is evicted

    Example custom policy:
    ```python
    class MyCustomPolicy(EvictionPolicy):
        def __init__(self):
            self.priorities = {}

        def on_block_allocated(self, block_id: int, step: int):
            self.priorities[block_id] = step

        def on_block_access(self, block_id: int, step: int):
            # Custom access tracking
            pass

        def select_victim(self, candidates: Set[int]) -> int:
            # Return block with lowest priority
            return min(candidates, key=lambda b: self.priorities.get(b, 0))

        def on_block_evicted(self, block_id: int):
            self.priorities.pop(block_id, None)
    ```
    """

    @abstractmethod
    def on_block_allocated(self, block_id: int, step: int) -> None:
        """
        Called when a new block is allocated on GPU.

        Args:
            block_id: The GPU block ID that was allocated
            step: Current inference step (monotonically increasing)
        """
        pass

    @abstractmethod
    def on_block_access(self, block_id: int, step: int) -> None:
        """
        Called when a block is accessed during attention computation.

        Args:
            block_id: The GPU block ID being accessed
            step: Current inference step
        """
        pass

    @abstractmethod
    def select_victim(self, candidates: Set[int]) -> int:
        """
        Select a block to evict from the candidate set.

        This is called when GPU memory is full and a new block
        needs to be allocated. The returned block will be evicted
        to CPU.

        Args:
            candidates: Set of GPU block IDs that can be evicted
                (blocks not currently being used)

        Returns:
            Block ID to evict

        Raises:
            ValueError: If candidates is empty
        """
        pass

    @abstractmethod
    def on_block_evicted(self, block_id: int) -> None:
        """
        Called after a block is evicted from GPU to CPU.

        Args:
            block_id: The GPU block ID that was evicted
        """
        pass

    def on_block_prefetched(self, block_id: int, step: int) -> None:
        """
        Called when a block is prefetched from CPU back to GPU.

        Default implementation calls on_block_allocated().
        Override for custom behavior.

        Args:
            block_id: The GPU block ID that was prefetched to
            step: Current inference step
        """
        self.on_block_allocated(block_id, step)

    def on_block_deallocated(self, block_id: int) -> None:
        """
        Called when a block is fully deallocated (sequence finished).

        Default implementation calls on_block_evicted().
        Override for custom behavior.

        Args:
            block_id: The GPU block ID being deallocated
        """
        self.on_block_evicted(block_id)

    def reset(self) -> None:
        """
        Reset policy state.

        Called when the inference engine is reset.
        Default implementation does nothing.
        """
        pass

    def get_eviction_order(self, candidates: Set[int], count: int) -> list:
        """
        Get multiple blocks to evict in order of priority.

        Default implementation calls select_victim() repeatedly.
        Override for more efficient batch selection.

        Args:
            candidates: Set of candidate block IDs
            count: Number of blocks to evict

        Returns:
            List of block IDs to evict, in order
        """
        result = []
        remaining = set(candidates)
        for _ in range(min(count, len(remaining))):
            if not remaining:
                break
            victim = self.select_victim(remaining)
            result.append(victim)
            remaining.remove(victim)
        return result
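Not part of the commit: a minimal sketch of how a cache manager might drive these callbacks in the order the Lifecycle docstring describes. TinyGpuCache and its capacity handling are assumptions for illustration only; the EvictionPolicy calls are the ones defined by the ABC above.

```python
# Sketch (not in the commit): a hypothetical manager driving the policy lifecycle.
from nanovllm.kvcache.policies import get_policy

class TinyGpuCache:
    def __init__(self, num_gpu_blocks: int, policy_name: str = "lru"):
        self.capacity = num_gpu_blocks
        self.resident: set[int] = set()   # block IDs currently on GPU
        self.policy = get_policy(policy_name)
        self.step = 0

    def allocate(self, block_id: int) -> None:
        self.step += 1
        if len(self.resident) >= self.capacity:
            victim = self.policy.select_victim(self.resident)  # 3. pick a block to offload
            self.resident.remove(victim)                       #    ... copy its KV data to CPU here ...
            self.policy.on_block_evicted(victim)               # 4. notify the policy
        self.resident.add(block_id)
        self.policy.on_block_allocated(block_id, self.step)    # 1. new block on GPU

    def access(self, block_id: int) -> None:
        self.step += 1
        self.policy.on_block_access(block_id, self.step)       # 2. attention touched the block
```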
101  nanovllm/kvcache/policies/fifo_policy.py  Normal file
@@ -0,0 +1,101 @@
"""
FIFO (First In, First Out) eviction policy.

Evicts the block that was allocated earliest.
Simple policy that ignores access patterns.
"""

from collections import OrderedDict
from typing import Set

from nanovllm.kvcache.policies.base_policy import EvictionPolicy


class FIFOPolicy(EvictionPolicy):
    """
    First In, First Out (FIFO) eviction policy.

    Evicts blocks in the order they were allocated,
    regardless of access patterns.

    Properties:
    - O(1) allocation and eviction tracking; victim selection scans the allocation order
    - Simple and predictable behavior
    - Good for streaming workloads where older data
      is naturally less relevant
    - Does not adapt to access patterns (unlike LRU)
    """

    def __init__(self):
        # OrderedDict maintains insertion order
        # Key: block_id, Value: allocation_step
        # Oldest (first allocated) is at the front
        self.allocation_order: OrderedDict[int, int] = OrderedDict()

    def on_block_allocated(self, block_id: int, step: int) -> None:
        """Record allocation order (does not change on access)."""
        if block_id not in self.allocation_order:
            self.allocation_order[block_id] = step

    def on_block_access(self, block_id: int, step: int) -> None:
        """
        FIFO ignores access patterns.

        This is the key difference from LRU - we don't
        update the position based on access.
        """
        pass  # Intentionally empty

    def select_victim(self, candidates: Set[int]) -> int:
        """Select the earliest allocated block from candidates."""
        if not candidates:
            raise ValueError("Cannot select victim from empty candidate set")

        # Iterate from oldest (front) to newest (back)
        for block_id in self.allocation_order:
            if block_id in candidates:
                return block_id

        # Fallback: return any candidate
        return next(iter(candidates))

    def on_block_evicted(self, block_id: int) -> None:
        """Remove block from tracking."""
        self.allocation_order.pop(block_id, None)

    def on_block_prefetched(self, block_id: int, step: int) -> None:
        """
        When prefetched, treat as new allocation.

        This moves the block to the end of the queue,
        giving it more time before eviction.
        """
        # Remove old entry if exists
        self.allocation_order.pop(block_id, None)
        # Add as new allocation
        self.allocation_order[block_id] = step

    def on_block_deallocated(self, block_id: int) -> None:
        """Remove block from tracking."""
        self.allocation_order.pop(block_id, None)

    def reset(self) -> None:
        """Clear all tracking data."""
        self.allocation_order.clear()

    def get_eviction_order(self, candidates: Set[int], count: int) -> list:
        """Get multiple blocks to evict in FIFO order."""
        result = []
        for block_id in self.allocation_order:
            if block_id in candidates:
                result.append(block_id)
                if len(result) >= count:
                    break
        return result

    def __repr__(self) -> str:
        return f"FIFOPolicy(tracked_blocks={len(self.allocation_order)})"
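Not part of the commit: a small illustration of FIFO behavior using only the methods above, assuming the policy is importable from the package __init__. Accesses are ignored, while a prefetch re-queues the block at the back.

```python
# Illustration (not in the commit): FIFO evicts by allocation age.
from nanovllm.kvcache.policies import FIFOPolicy

policy = FIFOPolicy()
for block_id in (0, 1, 2):
    policy.on_block_allocated(block_id, step=block_id)

policy.on_block_access(0, step=10)          # ignored by FIFO
print(policy.select_victim({0, 1, 2}))      # 0 -- still the oldest allocation

policy.on_block_prefetched(0, step=11)      # treated as a fresh allocation
print(policy.select_victim({0, 1, 2}))      # 1 -- block 0 moved to the back of the queue
print(policy.get_eviction_order({0, 1, 2}, count=2))  # [1, 2]
```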
93  nanovllm/kvcache/policies/lru_policy.py  Normal file
@@ -0,0 +1,93 @@
"""
LRU (Least Recently Used) eviction policy.

Evicts the block that was accessed least recently.
This is the default and recommended policy for most use cases.
"""

from collections import OrderedDict
from typing import Set

from nanovllm.kvcache.policies.base_policy import EvictionPolicy


class LRUPolicy(EvictionPolicy):
    """
    Least Recently Used (LRU) eviction policy.

    Maintains an ordered dictionary of block access times.
    When eviction is needed, selects the block that was
    accessed least recently.

    Properties:
    - O(1) access tracking
    - O(n) victim selection in worst case, but typically fast
      due to OrderedDict iteration order
    - Good for workloads with temporal locality
    """

    def __init__(self):
        # OrderedDict maintains insertion/update order
        # Key: block_id, Value: last_access_step
        # Oldest (least recently used) is at the front
        self.access_order: OrderedDict[int, int] = OrderedDict()

    def on_block_allocated(self, block_id: int, step: int) -> None:
        """Record allocation as an access."""
        # Move to end (most recently used)
        self.access_order[block_id] = step
        self.access_order.move_to_end(block_id)

    def on_block_access(self, block_id: int, step: int) -> None:
        """Update access time and move to end."""
        if block_id in self.access_order:
            self.access_order[block_id] = step
            self.access_order.move_to_end(block_id)

    def select_victim(self, candidates: Set[int]) -> int:
        """
        Select the least recently used block from candidates.

        Iterates from oldest to newest in access order,
        returns the first one that's in the candidate set.
        """
        if not candidates:
            raise ValueError("Cannot select victim from empty candidate set")

        # Iterate from oldest (front) to newest (back)
        for block_id in self.access_order:
            if block_id in candidates:
                return block_id

        # Fallback: return any candidate (shouldn't happen normally)
        return next(iter(candidates))

    def on_block_evicted(self, block_id: int) -> None:
        """Remove block from tracking."""
        self.access_order.pop(block_id, None)

    def on_block_deallocated(self, block_id: int) -> None:
        """Remove block from tracking."""
        self.access_order.pop(block_id, None)

    def reset(self) -> None:
        """Clear all tracking data."""
        self.access_order.clear()

    def get_eviction_order(self, candidates: Set[int], count: int) -> list:
        """
        Efficiently get multiple blocks to evict in LRU order.

        Optimized for batch eviction - iterates through access_order
        once instead of calling select_victim() multiple times.
        """
        result = []
        for block_id in self.access_order:
            if block_id in candidates:
                result.append(block_id)
                if len(result) >= count:
                    break
        return result

    def __repr__(self) -> str:
        return f"LRUPolicy(tracked_blocks={len(self.access_order)})"
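Not part of the commit: the matching LRU illustration, where a recent access moves a block to the back of the eviction order (contrast with the FIFO example above).

```python
# Illustration (not in the commit): under LRU, a recent access protects a block.
from nanovllm.kvcache.policies import LRUPolicy

policy = LRUPolicy()
for block_id in (0, 1, 2):
    policy.on_block_allocated(block_id, step=block_id)

policy.on_block_access(0, step=10)          # block 0 becomes most recently used
print(policy.select_victim({0, 1, 2}))      # 1 -- now the least recently used block
print(policy.get_eviction_order({0, 1, 2}, count=2))  # [1, 2]
print(policy)                               # LRUPolicy(tracked_blocks=3)
```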