feat: add configurable stride and chunk_size for XAttention BSA

- Add sparse_chunk_size config option (default: 16384)
- Pass stride, chunk_size, use_triton through factory function
- Add --sparse-stride CLI option to test_ruler.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Zijie Tian
Date: 2026-01-23 10:37:04 +08:00
commit 7c41032a2e
parent f28b500120
4 changed files with 10 additions and 0 deletions


@@ -51,6 +51,7 @@ class Config:
     sparse_threshold: float = 0.95  # Cumulative attention threshold (tau in XAttention)
     sparse_use_triton: bool = True  # Use Triton kernels for estimation
     sparse_stride: int = 8  # Stride for Q/K downsampling
+    sparse_chunk_size: int = 16384  # Triton kernel chunk size for estimation
 
     def __post_init__(self):
         assert os.path.isdir(self.model)
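
The hunk only adds the `Config` field; the factory pass-through described in the commit message is in another file. A minimal sketch of how such options might flow from the config into a factory function — the name `build_block_sparse_attention` and the keyword names are hypothetical, not taken from this commit:

```python
from dataclasses import dataclass


@dataclass
class Config:
    # Sparsity fields from the hunk above; the other Config fields are omitted.
    sparse_threshold: float = 0.95   # cumulative attention threshold (tau)
    sparse_use_triton: bool = True   # use Triton kernels for estimation
    sparse_stride: int = 8           # stride for Q/K downsampling
    sparse_chunk_size: int = 16384   # Triton kernel chunk size for estimation


def build_block_sparse_attention(cfg: Config) -> dict:
    # Hypothetical factory: collects the sparsity knobs as keyword
    # arguments, mirroring the "pass stride, chunk_size, use_triton
    # through factory function" change the commit describes. A real
    # implementation would forward these to the attention module's
    # constructor; here we just return the kwargs dict.
    return dict(
        threshold=cfg.sparse_threshold,
        stride=cfg.sparse_stride,
        chunk_size=cfg.sparse_chunk_size,
        use_triton=cfg.sparse_use_triton,
    )


kwargs = build_block_sparse_attention(Config())
print(kwargs["chunk_size"])  # default from the new field: 16384
```

Overriding a field at construction time (e.g. `Config(sparse_stride=16)`) is what the new `--sparse-stride` CLI option in `test_ruler.py` would feed into.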