[feat] Added num_gpu_blocks to limit GPU blocks.

This commit is contained in:
Zijie Tian
2025-12-10 20:17:42 +08:00
parent 01f19ee4a6
commit 0a247ccb1b
7 changed files with 150 additions and 9 deletions

View File

@@ -245,5 +245,5 @@ class Attention(nn.Module):
if o_acc is None:
raise RuntimeError("Chunked decode attention failed: no KV available")
# Output shape: [batch, 1, heads, dim] -> [batch, heads, dim]
return o_acc.squeeze(1)
# Output shape: [batch, 1, heads, dim] (same as normal decode)
return o_acc