[feat] Added sparse KV cache feature; NEEDS VERIFICATION.
This commit is contained in:
34
nanovllm/kvcache/sparse/full_policy.py
Normal file
34
nanovllm/kvcache/sparse/full_policy.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""
|
||||
Full attention policy - loads all blocks (no sparsity).
|
||||
|
||||
This serves as a baseline and default policy when sparse
|
||||
attention is not needed.
|
||||
"""
|
||||
|
||||
from typing import List
|
||||
from .policy import SparsePolicy, PolicyContext
|
||||
|
||||
|
||||
class FullAttentionPolicy(SparsePolicy):
    """
    Dense (non-sparse) policy: every available block is selected.

    Nothing is filtered out, so all previous KV cache blocks are
    loaded for each query chunk. This is the default behavior when
    no sparsity is wanted.

    Typical uses:
    - As the reference point when evaluating sparse policies
    - When full attention accuracy is required
    - On short sequences, where sparsity brings no benefit
    """

    def select_blocks(self, available_blocks: List[int], ctx: PolicyContext) -> List[int]:
        """
        Select every candidate block.

        Args:
            available_blocks: Candidate block identifiers.
            ctx: Policy context; not consulted by this policy.

        Returns:
            ``available_blocks`` itself (the same list object, not a copy).
        """
        # No sparsity: the selection is the full candidate list, untouched.
        selected = available_blocks
        return selected

    def __repr__(self) -> str:
        """Return the fixed textual form of this parameterless policy."""
        return "FullAttentionPolicy()"
|
||||
Reference in New Issue
Block a user