Files
nano-vllm/nanovllm/kvcache/sparse/full_policy.py
2025-12-22 08:51:02 +08:00

35 lines
897 B
Python

"""
Full attention policy - loads all blocks (no sparsity).
This serves as a baseline and default policy when sparse
attention is not needed.
"""
from typing import List
from .policy import SparsePolicy, PolicyContext
class FullAttentionPolicy(SparsePolicy):
    """
    Dense (non-sparse) block selection policy.

    Every block offered to the policy is selected, so no previous
    KV cache block is skipped for a query chunk.

    Typical uses:
    - As the reference point when evaluating sparse policies
    - When full attention accuracy is required
    - For short sequences, where sparsity brings little benefit
    """

    def select_blocks(self, available_blocks: List[int], ctx: PolicyContext) -> List[int]:
        """
        Select every available block.

        Args:
            available_blocks: Candidate blocks, given as a list of ints.
            ctx: Policy context; not consulted by this policy.

        Returns:
            The ``available_blocks`` list itself (same object, not a copy).
        """
        # Nothing is filtered out, so the input list is handed back as-is.
        return available_blocks

    def __repr__(self) -> str:
        """Return the fixed textual form of this policy."""
        return "FullAttentionPolicy()"