35 lines
897 B
Python
35 lines
897 B
Python
"""
Full attention policy - loads all blocks (no sparsity).

This serves as a baseline and default policy when sparse
attention is not needed.
"""
|
|
|
|
from typing import List
|
|
from .policy import SparsePolicy, PolicyContext
|
|
|
|
|
|
class FullAttentionPolicy(SparsePolicy):
    """
    Dense policy: every available block is selected, nothing is pruned.

    With this policy there is no sparsity - all previous KV cache
    blocks are loaded for each query chunk.

    Typical uses:
    - as the reference point when evaluating sparse policies
    - when full attention accuracy is required
    - on short sequences, where sparsity isn't beneficial
    """

    def select_blocks(self, available_blocks: List[int], ctx: PolicyContext) -> List[int]:
        """
        Select all of ``available_blocks``.

        Args:
            available_blocks: The candidate blocks to choose from.
            ctx: Policy context; this policy does not read it.

        Returns:
            The ``available_blocks`` list itself (the same object, not a
            copy), unfiltered and in its original order.
        """
        # No filtering step: the selection is the input list as-is.
        selected = available_blocks
        return selected

    def __repr__(self) -> str:
        """Return the fixed textual form of this parameterless policy."""
        return "FullAttentionPolicy()"
|