🐛 fix: memory leak in XAttentionBSAPolicy select_blocks
Fix severe memory leak (64GB -> 4GB growth) by: - Remove unused sparse_metadata storage (was accumulating attn_scores) - Delete intermediate tensor list (attn_scores_list) after use - Explicitly delete intermediate tensors before return Before: 16GB -> 80GB during 128K prefill After: 16GB -> 19.8GB during 128K prefill Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -235,8 +235,8 @@ class XAttentionBSAPolicy(SparsePolicy):
|
|||||||
return available_blocks
|
return available_blocks
|
||||||
|
|
||||||
attn_scores = torch.cat(attn_scores_list, dim=-1)
|
attn_scores = torch.cat(attn_scores_list, dim=-1)
|
||||||
# Store in sparse_metadata for later use in compute_chunked_prefill
|
# Free intermediate list immediately
|
||||||
self.sparse_metadata[layer_id] = attn_scores
|
del attn_scores_list
|
||||||
|
|
||||||
# Step 2: Apply softmax_fuse_block_sum to get block-level attention
|
# Step 2: Apply softmax_fuse_block_sum to get block-level attention
|
||||||
# block_size = reshaped_block_size so each CPU block maps to exactly 1 output block
|
# block_size = reshaped_block_size so each CPU block maps to exactly 1 output block
|
||||||
@@ -320,6 +320,9 @@ class XAttentionBSAPolicy(SparsePolicy):
|
|||||||
logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
|
logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
|
||||||
f"selected={len(selected_block_ids)}, chunk_density={chunk_density:.1%}")
|
f"selected={len(selected_block_ids)}, chunk_density={chunk_density:.1%}")
|
||||||
|
|
||||||
|
# Free intermediate tensors to prevent memory leak
|
||||||
|
del attn_scores, block_sums, mask, mask_per_kv_head, vote_count, vote_ratio, block_selected
|
||||||
|
|
||||||
return selected_block_ids
|
return selected_block_ids
|
||||||
|
|
||||||
def compute_chunked_prefill(
|
def compute_chunked_prefill(
|
||||||
|
|||||||
Reference in New Issue
Block a user