[WIP] needs refactoring.
@@ -98,10 +98,10 @@ class Attention(nn.Module):
                 max_seqlen_q=context.max_seqlen_q, cu_seqlens_q=context.cu_seqlens_q,
                 max_seqlen_k=context.max_seqlen_k, cu_seqlens_k=context.cu_seqlens_k,
                 softmax_scale=self.scale, causal=True, block_table=context.block_tables)
-        elif context.sparse_prefill_policy is not None:
-            # Sparse prefill (GPU-only) - delegate to policy
-            o = context.sparse_prefill_policy.sparse_prefill_attention(
-                q, k, v, self.layer_id
+        elif context.attention_policy is not None:
+            # Attention via policy (GPU-only) - delegate to policy
+            o = context.attention_policy.compute_prefill(
+                q, k, v, self.layer_id, softmax_scale=self.scale
             )
         else:
             o = flash_attn_varlen_func(q, k, v,
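For context, the new branch only requires that whatever object is set on context.attention_policy exposes a compute_prefill method taking the packed q/k/v tensors, the layer id, and a softmax scale. Below is a minimal sketch of that assumed interface; the AttentionPolicy name, the Protocol formulation, and the tensor shapes in the comments are assumptions for illustration, only compute_prefill and its call-site arguments appear in the diff.

from typing import Protocol
import torch

class AttentionPolicy(Protocol):
    """Hypothetical policy interface matching the call site in Attention.forward."""

    def compute_prefill(
        self,
        q: torch.Tensor,       # packed queries, e.g. (total_q_tokens, num_heads, head_dim) -- shape is an assumption
        k: torch.Tensor,       # packed keys, same token-packed layout as q
        v: torch.Tensor,       # packed values, same layout as k
        layer_id: int,         # index of the Attention layer making the call
        *,
        softmax_scale: float,  # self.scale at the call site, typically 1/sqrt(head_dim)
    ) -> torch.Tensor:         # attention output with the same packed layout as q
        ...

Keeping the branch behind one duck-typed method keeps Attention.forward free of policy-specific imports: any object implementing compute_prefill and assigned to context.attention_policy can take over prefill attention for that layer, with flash_attn_varlen_func remaining the fallback.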