# nano-vllm/nanovllm/kvcache/sparse/__init__.py

"""
Sparse Attention Policy module.

Provides pluggable policies for selecting which KV blocks to load
during chunked attention with CPU offload.

Usage:
    from nanovllm.kvcache.sparse import SparsePolicy, SparsePolicyType, create_sparse_policy

    # Create policy using factory function
    policy = create_sparse_policy(SparsePolicyType.QUEST, topk_blocks=8)

    # Or create custom policy
    class MyPolicy(SparsePolicy):
        supports_prefill = True
        supports_decode = True

        def select_blocks(self, available_blocks, ctx):
            return available_blocks[:5]  # Just first 5 blocks
"""

from nanovllm.config import SparsePolicyType
from nanovllm.kvcache.sparse.policy import SparsePolicy, PolicyContext
from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
from nanovllm.kvcache.sparse.quest import QuestPolicy, QuestConfig, BlockMetadataManager


def create_sparse_policy(policy_type: SparsePolicyType, **kwargs) -> SparsePolicy:
    """
    Build an uninitialized sparse policy for the requested policy type.

    This factory only constructs the policy object. ``policy.initialize()``
    must still be called afterwards, either by the caller or by the
    framework during KV cache allocation.

    Args:
        policy_type: SparsePolicyType enum value selecting the policy.
        **kwargs: Policy-specific configuration options. For QUEST the keys
            read are ``topk_blocks`` (default 8), ``threshold_blocks``
            (default 4), ``include_sink_blocks`` (default 0) and
            ``include_recent_blocks`` (default 0). Any other key is ignored,
            and the FULL policy takes no options at all.

    Returns:
        SparsePolicy instance (not initialized).

    Raises:
        ValueError: If ``policy_type`` is neither FULL nor QUEST.

    Example:
        policy = create_sparse_policy(SparsePolicyType.QUEST, topk_blocks=4)
        policy.initialize(num_layers=28, num_kv_heads=8, ...)
    """
    # Full attention needs no configuration; kwargs are deliberately unused.
    if policy_type == SparsePolicyType.FULL:
        return FullAttentionPolicy()

    if policy_type == SparsePolicyType.QUEST:
        # Option name -> fallback used when the caller does not supply it.
        quest_defaults = {
            "topk_blocks": 8,
            "threshold_blocks": 4,
            "include_sink_blocks": 0,
            "include_recent_blocks": 0,
        }
        quest_options = {
            option: kwargs.get(option, fallback)
            for option, fallback in quest_defaults.items()
        }
        return QuestPolicy(QuestConfig(**quest_options))

    raise ValueError(f"Unknown policy type: {policy_type}")


# Public API of this package: the names below are the ones exposed by
# `from nanovllm.kvcache.sparse import *`.
__all__ = [
    # Policy interface and the context type imported alongside it
    "SparsePolicy",
    "PolicyContext",
    # Enum used by create_sparse_policy() to pick a policy (from nanovllm.config)
    "SparsePolicyType",
    # Policy implementation from full_policy
    "FullAttentionPolicy",
    # Names re-exported from the quest module
    "QuestPolicy",
    "QuestConfig",
    "BlockMetadataManager",
    # Factory function defined in this module
    "create_sparse_policy",
]