[WIP] Before adding Quest policy.

2026-01-07 02:32:30 +08:00
parent f240903013
commit c99a6f3d3f
11 changed files with 166 additions and 191 deletions
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -29,8 +29,9 @@ class Config:
    num_gpu_kvcache_blocks: int = -1
    num_cpu_kvcache_blocks: int = -1

-    # Sparse attention configuration
-    sparse_policy: str | None = None  # "vertical_slash", "quest", "streaming_llm", or None
+    # Sparse attention configuration (dual policy architecture)
+    prefill_policy: str = "full"  # "full", "quest", "vertical_slash", "streaming_llm"
+    decode_policy: str = "full"   # "full", "quest", "vertical_slash", "streaming_llm"
    sparse_num_sink_blocks: int = 1  # Number of sink blocks for sparse patterns
    sparse_local_window_blocks: int = 2  # Local window size for VerticalSlash
    sparse_topk_blocks: int = 8  # Top-K blocks for Quest