[WIP] Before refactoring compute_chunked_prefill.

2026-01-23 03:36:12 +08:00
parent edc006463b
commit ca32ea6f93
7 changed files with 914 additions and 114 deletions
--- a/nanovllm/layers/attention.py
+++ b/nanovllm/layers/attention.py
@@ -258,8 +258,12 @@ class Attention(nn.Module):
            raise RuntimeError("sparse_policy is required for chunked decode")

        # Check if policy supports decode phase
+        # If not, fall back to FullAttentionPolicy (e.g., XAttentionBSAPolicy only supports prefill)
        if not sparse_policy.supports_decode:
-            raise RuntimeError(f"{sparse_policy} does not support decode phase")
+            from nanovllm.kvcache.sparse import FullAttentionPolicy
+            sparse_policy = FullAttentionPolicy()
+            logger.debug(f"[DEBUG] {kvcache_manager.sparse_policy} doesn't support decode, "
+                         f"falling back to FullAttentionPolicy")

        # [DEBUG] Verify execution path
        logger.debug(f"[DEBUG] Calling sparse_policy.compute_chunked_decode, "