[feat] Added num_gpu_blocks to limit the number of GPU blocks.

2025-12-10 20:17:42 +08:00
parent 01f19ee4a6
commit 0a247ccb1b
7 changed files with 150 additions and 9 deletions
--- a/nanovllm/layers/attention.py
+++ b/nanovllm/layers/attention.py
@@ -245,5 +245,5 @@ class Attention(nn.Module):
        if o_acc is None:
            raise RuntimeError("Chunked decode attention failed: no KV available")

-        # Output shape: [batch, 1, heads, dim] -> [batch, heads, dim]
-        return o_acc.squeeze(1)
+        # Output shape: [batch, 1, heads, dim] (same as normal decode)
+        return o_acc