Files
nano-vllm/nanovllm/kvcache/sparse/full_policy.py
2026-01-06 20:47:55 +08:00

39 lines
1007 B
Python

"""
Full attention policy - loads all blocks (no sparsity).
This serves as a baseline and default policy when sparse
attention is not needed.
"""
from typing import List
from .policy import SparsePolicy, PolicyContext
class FullAttentionPolicy(SparsePolicy):
    """
    Dense (non-sparse) selection policy: every available block is kept.

    With this policy nothing is filtered out, so all previous KV cache
    blocks are loaded for each query chunk. It is the default behavior
    when no sparsity is requested.

    Typical uses:
    - As the reference baseline when comparing sparse policies
    - When full attention accuracy is required
    - On short sequences, where sparsity isn't beneficial
    """

    # Capability flags: this policy is usable in both the decode and the
    # prefill phase.
    supports_decode = True
    supports_prefill = True

    def __repr__(self) -> str:
        """Return a fixed, argument-free textual form of the policy."""
        return "FullAttentionPolicy()"

    def select_blocks(
        self,
        available_blocks: List[int],
        ctx: PolicyContext,
    ) -> List[int]:
        """
        Select every candidate block, i.e. apply no sparsity.

        Args:
            available_blocks: Candidate block ids offered to the policy.
            ctx: Policy context; accepted for interface compatibility
                and not consulted by this policy.

        Returns:
            The ``available_blocks`` list itself (the same object, not a
            copy), unfiltered and in its original order.
        """
        # Pass-through: nothing is dropped or reordered.
        selected = available_blocks
        return selected