feat: add xattn_estimate_chunked for chunked prefill support

- Add xattn_estimate_chunked function ported from COMPASS
- Support chunked prefill with q_start_pos parameter
- Ensure 100% consistency with standard xattn_estimate when using matching chunk_size parameter
- Add test and documentation

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 01:13:17 +08:00
parent 2866d4fd88
commit bc92c1fdb8
5 changed files with 561 additions and 0 deletions
--- a/nanovllm/ops/__init__.py
+++ b/nanovllm/ops/__init__.py
@@ -13,6 +13,7 @@ from nanovllm.ops.chunked_attention import (

 from nanovllm.ops.xattn import (
    xattn_estimate,
+    xattn_estimate_chunked,
    flat_group_gemm_fuse_reshape,
    softmax_fuse_block_sum,
    find_blocks_chunked,
@@ -28,6 +29,7 @@ __all__ = [
    "ChunkedPrefillState",
    # xattn
    "xattn_estimate",
+    "xattn_estimate_chunked",
    "flat_group_gemm_fuse_reshape",
    "softmax_fuse_block_sum",
    "find_blocks_chunked",