📊 feat: distinguish compute density and communication density in DensityObserver
- Add record_comm_density() call in select_blocks to track CPU block selection
- Add get_per_layer_comm_density() method for detailed analysis
- Update print_summary() to show both densities and H2D savings ratio
- Set DensityObserver mode (offload/gpu_only) in test_ruler.py
- Update get_summary() to return both density types

Key insight: Comm density can be 100% even when compute density is ~37%, because the sparse BSA blocks are distributed across all CPU blocks. Since CPU block granularity is 32x coarser (4096 vs 128 tokens), any() aggregation across heads/Q-blocks results in all CPU blocks being needed.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -905,6 +905,15 @@ class XAttentionBSAPolicy(SparsePolicy):
|
|||||||
self._stats_total_selected_blocks += len(selected_block_ids)
|
self._stats_total_selected_blocks += len(selected_block_ids)
|
||||||
self._stats_num_chunks += 1
|
self._stats_num_chunks += 1
|
||||||
|
|
||||||
|
# Record communication density to DensityObserver
|
||||||
|
# Comm density = selected_cpu_blocks / available_cpu_blocks
|
||||||
|
# This is different from compute density (BSA block granularity)
|
||||||
|
DensityObserver.record_comm_density(
|
||||||
|
layer_id=layer_id,
|
||||||
|
selected_cpu_blocks=len(selected_block_ids),
|
||||||
|
total_cpu_blocks=len(available_blocks),
|
||||||
|
)
|
||||||
|
|
||||||
# Log per-chunk density
|
# Log per-chunk density
|
||||||
chunk_density = len(selected_block_ids) / len(available_blocks)
|
chunk_density = len(selected_block_ids) / len(available_blocks)
|
||||||
logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
|
logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
|
||||||
|
|||||||
@@ -266,14 +266,31 @@ class DensityObserver(Observer):
|
|||||||
return 0.0
|
return 0.0
|
||||||
return sum(all_densities) / len(all_densities)
|
return sum(all_densities) / len(all_densities)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_per_layer_comm_density(cls) -> Dict[int, float]:
|
||||||
|
"""
|
||||||
|
获取每层的 communication density (CPU block 粒度)。
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[layer_id, avg_comm_density]
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
|
for layer_id, densities in cls._layer_comm_densities.items():
|
||||||
|
if densities:
|
||||||
|
result[layer_id] = sum(densities) / len(densities)
|
||||||
|
return result
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_summary(cls) -> dict:
|
def get_summary(cls) -> dict:
|
||||||
"""返回统计摘要"""
|
"""返回统计摘要"""
|
||||||
per_layer = cls.get_per_layer_density()
|
per_layer = cls.get_per_layer_density()
|
||||||
|
per_layer_comm = cls.get_per_layer_comm_density()
|
||||||
return {
|
return {
|
||||||
"mode": cls._mode,
|
"mode": cls._mode,
|
||||||
"overall_density": cls.get_overall_density(),
|
"overall_compute_density": cls.get_overall_density(),
|
||||||
"per_layer_density": per_layer,
|
"overall_comm_density": cls.get_overall_comm_density(),
|
||||||
|
"per_layer_compute_density": per_layer,
|
||||||
|
"per_layer_comm_density": per_layer_comm,
|
||||||
"num_layers": len(per_layer),
|
"num_layers": len(per_layer),
|
||||||
"last_mask_shape": {
|
"last_mask_shape": {
|
||||||
"q_blocks": cls._last_q_blocks,
|
"q_blocks": cls._last_q_blocks,
|
||||||
@@ -301,7 +318,9 @@ class DensityObserver(Observer):
|
|||||||
print(f"[DensityObserver] Mode: {cls._mode}")
|
print(f"[DensityObserver] Mode: {cls._mode}")
|
||||||
print(f" Compute density: {overall:.4f} (min: {min_density:.4f} @ layer {min_layer})")
|
print(f" Compute density: {overall:.4f} (min: {min_density:.4f} @ layer {min_layer})")
|
||||||
if overall_comm > 0:
|
if overall_comm > 0:
|
||||||
print(f" Comm density: {overall_comm:.4f}")
|
# Offload mode: show both densities with explanation
|
||||||
|
print(f" Comm density: {overall_comm:.4f} (CPU block granularity)")
|
||||||
|
print(f" Savings ratio: {1 - overall_comm:.1%} H2D transfer reduction")
|
||||||
print(f" Num layers: {len(per_layer)}")
|
print(f" Num layers: {len(per_layer)}")
|
||||||
# 输出 layer 0 的 density 用于对比
|
# 输出 layer 0 的 density 用于对比
|
||||||
if 0 in per_layer:
|
if 0 in per_layer:
|
||||||
|
|||||||
@@ -386,8 +386,11 @@ def run_ruler_benchmark(
|
|||||||
if sparse_policy and sparse_policy.upper() == "XATTN_BSA":
|
if sparse_policy and sparse_policy.upper() == "XATTN_BSA":
|
||||||
DensityObserver.enable()
|
DensityObserver.enable()
|
||||||
DensityObserver.complete_reset()
|
DensityObserver.complete_reset()
|
||||||
|
# Set mode for correct density interpretation
|
||||||
|
DensityObserver.set_mode("offload" if enable_cpu_offload else "gpu_only")
|
||||||
if not json_output:
|
if not json_output:
|
||||||
print("[DensityObserver] Enabled for XAttention BSA")
|
mode_str = "offload" if enable_cpu_offload else "gpu_only"
|
||||||
|
print(f"[DensityObserver] Enabled for XAttention BSA (mode: {mode_str})")
|
||||||
|
|
||||||
# LLM initialization kwargs
|
# LLM initialization kwargs
|
||||||
llm_kwargs = {
|
llm_kwargs = {
|
||||||
|
|||||||
Reference in New Issue
Block a user