From ed3c8bb4b8a9946a4ea340fd0e292e0a9e6a6f44 Mon Sep 17 00:00:00 2001
From: Zijie Tian <zijietian@mail.xmu.edu.cn>
Date: Fri, 23 Jan 2026 09:30:18 +0800
Subject: [PATCH] =?UTF-8?q?=F0=9F=90=9B=20fix:=20memory=20leak=20in=20XAtt?=
 =?UTF-8?q?entionBSAPolicy=20select=5Fblocks?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix a severe memory leak (memory growth during 128K prefill reduced
from ~64GB to ~4GB) by:
- Removing the unused sparse_metadata storage (it was accumulating attn_scores)
- Deleting the intermediate tensor list (attn_scores_list) right after use
- Explicitly deleting intermediate tensors before return

Before: 16GB -> 80GB   during 128K prefill (+64GB)
After:  16GB -> 19.8GB during 128K prefill (+3.8GB)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 nanovllm/kvcache/sparse/xattn_bsa.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nanovllm/kvcache/sparse/xattn_bsa.py b/nanovllm/kvcache/sparse/xattn_bsa.py
index 1fedf89..c8aebf7 100644
--- a/nanovllm/kvcache/sparse/xattn_bsa.py
+++ b/nanovllm/kvcache/sparse/xattn_bsa.py
@@ -235,8 +235,8 @@ class XAttentionBSAPolicy(SparsePolicy):
             return available_blocks
 
         attn_scores = torch.cat(attn_scores_list, dim=-1)
-        # Store in sparse_metadata for later use in compute_chunked_prefill
-        self.sparse_metadata[layer_id] = attn_scores
+        # Free intermediate list immediately
+        del attn_scores_list
 
         # Step 2: Apply softmax_fuse_block_sum to get block-level attention
         # block_size = reshaped_block_size so each CPU block maps to exactly 1 output block
@@ -320,6 +320,9 @@ class XAttentionBSAPolicy(SparsePolicy):
             logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
                         f"selected={len(selected_block_ids)}, chunk_density={chunk_density:.1%}")
 
+        # Free intermediate tensors to prevent memory leak
+        del attn_scores, block_sums, mask, mask_per_kv_head, vote_count, vote_ratio, block_selected
+
         return selected_block_ids
 
     def compute_chunked_prefill(