[claudesquad] update from 'layer-prefill-1' on 08 Jan 26 03:36 CST

2026-01-08 03:36:39 +08:00
parent 6575099a06
commit d8a87da1c3
10 changed files with 822 additions and 32 deletions
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -9,6 +9,7 @@ class SparsePolicyType(Enum):
    """Sparse attention policy types."""
    FULL = auto()   # No sparse attention (load all blocks)
    QUEST = auto()  # Query-aware Top-K block selection (decode only)
+    MINFERENCE = auto()  # MInference vertical + slash sparse prefill (GPU-only)


@dataclass
@@ -39,10 +40,18 @@ class Config:
    # Sparse attention configuration
    # Quest: decode-only sparse attention with Top-K block selection
    # FULL: no sparse attention (load all blocks)
+    # MINFERENCE: MInference vertical + slash sparse prefill (GPU-only)
    sparse_policy: SparsePolicyType = SparsePolicyType.FULL
    sparse_topk_blocks: int = 8  # Top-K blocks for Quest
    sparse_threshold_blocks: int = 4  # Apply sparse only when blocks > threshold

+    # MInference configuration (used when sparse_policy == MINFERENCE)
+    minference_adaptive_budget: float = 0.3  # Budget as a fraction of seq_len (set to None to use the fixed sizes instead)
+    minference_vertical_size: int = 1000  # Fixed vertical size (used if minference_adaptive_budget is None)
+    minference_slash_size: int = 6096  # Fixed slash size (used if minference_adaptive_budget is None)
+    minference_num_sink_tokens: int = 30  # Sink tokens to always keep
+    minference_num_recent_diags: int = 100  # Recent diagonals to always keep
+
    def __post_init__(self):
        assert os.path.isdir(self.model)
        assert self.kvcache_block_size % 256 == 0