✨ feat: add GPU-only XAttention BSA sparse attention support
- Implement compute_prefill() in XAttentionBSAPolicy for GPU-only mode
- Uses xattn_estimate to compute the sparse block mask
- Uses block_sparse_attn_func for efficient sparse attention
- Handles GQA by expanding K/V heads
- Falls back to flash_attn for paged KV cache (prefix cache)
- Implement compute_decode() by delegating to FullAttentionPolicy
- Add --policy xattn option to bench.py

Verified: RULER 32k niah_single_1 5/5 samples passed (100%)

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
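For orientation, here is a hedged caller-side sketch of the new prefill entry point. It uses only the signature added in the diff below; `policy` stands for an already-constructed XAttentionBSAPolicy (its constructor arguments are not part of this commit), and the tensor sizes are illustrative.

import torch

# Caller-side sketch (illustrative sizes; `policy` is an XAttentionBSAPolicy
# instance constructed elsewhere). block_tables=None means contiguous K/V,
# so the XAttention + BSA path is taken rather than the flash_attn fallback.
seq_len, num_heads, num_kv_heads, head_dim = 4096, 32, 8, 128
q = torch.randn(seq_len, num_heads, head_dim, device="cuda", dtype=torch.bfloat16)
k = torch.randn(seq_len, num_kv_heads, head_dim, device="cuda", dtype=torch.bfloat16)
v = torch.randn(seq_len, num_kv_heads, head_dim, device="cuda", dtype=torch.bfloat16)
cu_seqlens = torch.tensor([0, seq_len], dtype=torch.int32, device="cuda")

out = policy.compute_prefill(
    q, k, v,
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_k=cu_seqlens,
    max_seqlen_q=seq_len,
    max_seqlen_k=seq_len,
    softmax_scale=head_dim ** -0.5,
    layer_id=0,
    block_tables=None,
)
# out: [seq_len, num_heads, head_dim], same layout as flash_attn_varlen_func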
bench.py
@@ -51,6 +51,9 @@ def main():
    parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
    parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
    # Sparse policy option (GPU-only mode now supports policy routing)
    parser.add_argument("--policy", type=str, default=None,
                        choices=["full", "xattn"],
                        help="Sparse policy: full (FullAttention), xattn (XAttention+BSA)")
    parser.add_argument("--enable-policy", action="store_true",
                        help="Enable sparse policy routing (FullAttentionPolicy by default)")
    args = parser.parse_args()
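With these options, XAttention routing is enabled for a run via `python bench.py --enable-policy --policy xattn` (or `--policy full` for full attention); the remaining bench.py flags are unchanged.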
@@ -59,7 +62,10 @@ def main():
    max_len = args.max_len

    # Configure sparse policy
    if args.enable_policy:
        if args.policy == "xattn":
            sparse_policy = SparsePolicyType.XATTN_BSA
            print(f"\n[nanovllm GPU + XAttention BSA] max_len={max_len:,}")
        elif args.policy == "full" or args.enable_policy:
            sparse_policy = SparsePolicyType.FULL
            print(f"\n[nanovllm GPU + Policy] sparse_policy=FULL, max_len={max_len:,}")
    else:
@@ -122,6 +122,206 @@ class XAttentionBSAPolicy(SparsePolicy):
        self._stats_total_selected_blocks = 0
        self._stats_num_chunks = 0

    # =========================================================================
    # GPU-only methods (non-chunked)
    # =========================================================================

    def compute_prefill(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        cu_seqlens_q: torch.Tensor,
        cu_seqlens_k: torch.Tensor,
        max_seqlen_q: int,
        max_seqlen_k: int,
        softmax_scale: float,
        layer_id: int,
        block_tables: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        GPU-only prefill attention using XAttention + BSA.

        This method implements sparse attention for GPU-only mode:
        1. Estimate block importance using xattn_estimate.
        2. Compute sparse attention using block_sparse_attn_func.

        Args:
            q: Query tensor [total_q, num_heads, head_dim] (varlen packed)
            k: Key tensor [total_kv, num_kv_heads, head_dim] (varlen packed)
            v: Value tensor [total_kv, num_kv_heads, head_dim] (varlen packed)
            cu_seqlens_q: Cumulative sequence lengths for Q [batch+1]
            cu_seqlens_k: Cumulative sequence lengths for K [batch+1]
            max_seqlen_q: Maximum Q sequence length
            max_seqlen_k: Maximum K sequence length
            softmax_scale: Softmax scaling factor
            layer_id: Transformer layer index
            block_tables: Paged attention block tables (not used for XAttention)

        Returns:
            Attention output [total_q, num_heads, head_dim]
        """
        # When block_tables is provided (paged KV cache / prefix cache),
        # fall back to flash_attn, as XAttention expects contiguous K/V.
        if block_tables is not None:
            from flash_attn import flash_attn_varlen_func
            return flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                softmax_scale=softmax_scale,
                causal=True,
                block_table=block_tables,
            )

        if not BSA_AVAILABLE:
            # Fall back to flash attention if BSA is not available
            from flash_attn import flash_attn_varlen_func
            return flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                softmax_scale=softmax_scale,
                causal=True,
            )

        if not XATTN_AVAILABLE:
            # Fall back to flash attention if xattn is not available
            from flash_attn import flash_attn_varlen_func
            return flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                softmax_scale=softmax_scale,
                causal=True,
            )
        from nanovllm.ops.xattn import xattn_estimate

        # Get dimensions
        total_q, num_heads, head_dim = q.shape
        total_kv, num_kv_heads, _ = k.shape

        # For now, assume batch_size = 1 (single sequence).
        # TODO: Support batched varlen format.
        batch_size = cu_seqlens_q.shape[0] - 1
        if batch_size != 1:
            # Fall back to flash attention for batched input
            from flash_attn import flash_attn_varlen_func
            logger.warning(f"[XAttn] batch_size={batch_size} > 1, falling back to flash attention")
            return flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_q,
                max_seqlen_k=max_seqlen_k,
                softmax_scale=softmax_scale,
                causal=True,
            )
        q_len = max_seqlen_q
        k_len = max_seqlen_k

        # Convert from varlen format [total, heads, dim] to [batch, heads, seq, dim]
        # q: [q_len, num_heads, head_dim] -> [1, num_heads, q_len, head_dim]
        Q = q.unsqueeze(0).transpose(1, 2)  # [1, num_heads, q_len, head_dim]
        K = k.unsqueeze(0).transpose(1, 2)  # [1, num_kv_heads, k_len, head_dim]
        V = v.unsqueeze(0).transpose(1, 2)  # [1, num_kv_heads, k_len, head_dim]

        # Expand KV for GQA (xattn_estimate requires matching head counts)
        K_exp, V_exp = expand_kv_for_gqa(K, V, num_heads)

        # Estimate block importance and get the sparse block mask
        _, mask = xattn_estimate(
            Q, K_exp,
            chunk_size=self.chunk_size,
            block_size=self.BSA_BLOCK_SIZE,
            threshold=self.threshold,
            use_triton=self.use_triton,
            causal=True,
        )
        # Compute block counts
        q_block_num = (q_len + self.BSA_BLOCK_SIZE - 1) // self.BSA_BLOCK_SIZE
        k_block_num = (k_len + self.BSA_BLOCK_SIZE - 1) // self.BSA_BLOCK_SIZE
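        # For example (illustrative numbers, not from this diff): with
        # BSA_BLOCK_SIZE = 128 and a 32k-token prefill, the ceiling division
        # above gives q_block_num = k_block_num = ceil(32768 / 128) = 256,
        # i.e. the mask describes a 256 x 256 block grid per head.
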
        # Prepare tensors for BSA
        # q, k, v need to be [seq_len, num_heads, head_dim]
        q_bsa = q  # Already [q_len, num_heads, head_dim]

        # For GQA with BSA, expand k, v to match num_heads
        # k, v: [k_len, num_kv_heads, head_dim] -> [k_len, num_heads, head_dim]
        if num_heads != num_kv_heads:
            num_groups = num_heads // num_kv_heads
            k_bsa = k.repeat_interleave(num_groups, dim=1)
            v_bsa = v.repeat_interleave(num_groups, dim=1)
        else:
            k_bsa = k
            v_bsa = v
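        # For example (illustrative head counts): with num_heads = 32 and
        # num_kv_heads = 8, num_groups = 4, so each KV head is repeated 4
        # times along dim=1 to line up with the 32 query heads.
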
        # Prepare BSA inputs
        cu_seqlens_q_bsa = torch.tensor([0, q_len], dtype=torch.int32, device=q.device)
        cu_seqlens_k_bsa = torch.tensor([0, k_len], dtype=torch.int32, device=k.device)
        head_groups = torch.ones(num_heads, dtype=torch.int32, device=q.device)

        # Trim the mask to the actual block counts
        mask_trimmed = mask[:, :, :q_block_num, :k_block_num].contiguous()
        # Compute sparse attention using BSA
        output = block_sparse_attn_func(
            q_bsa, k_bsa, v_bsa,
            cu_seqlens_q_bsa,
            cu_seqlens_k_bsa,
            head_groups,
            None,  # key_padding_mask
            mask_trimmed,
            q_len, k_len,
            p_dropout=0.0,
            deterministic=True,
            is_causal=True,
        )

        # Update statistics (layer 0 only, to avoid overcounting)
        if layer_id == 0:
            selected_blocks = mask_trimmed.sum().item()
            total_blocks = q_block_num * k_block_num * num_heads
            density = selected_blocks / total_blocks if total_blocks > 0 else 1.0
            logger.debug(f"[XAttn GPU-only] layer={layer_id}, q_blocks={q_block_num}, "
                         f"k_blocks={k_block_num}, density={density:.1%}")
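            # Note: total_blocks counts the full q_block_num x k_block_num
            # rectangle per head, so a dense causal mask would already report
            # roughly 50% density; values well below that reflect the block
            # sparsity introduced by the threshold.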

        return output

    def compute_decode(
        self,
        q: torch.Tensor,
        k_cache: torch.Tensor,
        v_cache: torch.Tensor,
        cache_seqlens: torch.Tensor,
        softmax_scale: float,
        layer_id: int,
        block_tables: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        GPU-only decode attention - delegates to FullAttentionPolicy.

        XAttention is designed for long prefill sequences. For decode (single token),
        we use FullAttentionPolicy, which calls flash_attn_with_kvcache.
        """
        from nanovllm.kvcache.sparse.full_policy import FullAttentionPolicy
        return FullAttentionPolicy().compute_decode(
            q, k_cache, v_cache, cache_seqlens, softmax_scale, layer_id, block_tables
        )

    # =========================================================================
    # Chunked offload methods
    # =========================================================================

    def select_blocks(
        self,
        available_blocks: List[int],