From ed3c8bb4b8a9946a4ea340fd0e292e0a9e6a6f44 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Fri, 23 Jan 2026 09:30:18 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix:=20memory=20leak=20in=20XAtt?= =?UTF-8?q?entionBSAPolicy=20select=5Fblocks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a severe memory leak (memory growth during 128K prefill reduced from 64GB to ~4GB) by: - Removing the unused sparse_metadata storage (it was accumulating attn_scores) - Deleting the intermediate tensor list (attn_scores_list) after use - Explicitly deleting intermediate tensors before return Before: 16GB -> 80GB during 128K prefill After: 16GB -> 19.8GB during 128K prefill Co-Authored-By: Claude Opus 4.5 --- nanovllm/kvcache/sparse/xattn_bsa.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/nanovllm/kvcache/sparse/xattn_bsa.py b/nanovllm/kvcache/sparse/xattn_bsa.py index 1fedf89..c8aebf7 100644 --- a/nanovllm/kvcache/sparse/xattn_bsa.py +++ b/nanovllm/kvcache/sparse/xattn_bsa.py @@ -235,8 +235,8 @@ class XAttentionBSAPolicy(SparsePolicy): return available_blocks attn_scores = torch.cat(attn_scores_list, dim=-1) - # Store in sparse_metadata for later use in compute_chunked_prefill - self.sparse_metadata[layer_id] = attn_scores + # Free intermediate list immediately + del attn_scores_list # Step 2: Apply softmax_fuse_block_sum to get block-level attention # block_size = reshaped_block_size so each CPU block maps to exactly 1 output block @@ -320,6 +320,9 @@ class XAttentionBSAPolicy(SparsePolicy): logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, " f"selected={len(selected_block_ids)}, chunk_density={chunk_density:.1%}") + # Free intermediate tensors to prevent memory leak + del attn_scores, block_sums, mask, mask_per_kv_head, vote_count, vote_ratio, block_selected + return selected_block_ids def compute_chunked_prefill(