# Patch summary (free text before the first "diff --git" header; not part of any hunk).
#
# Purpose: report "communication density" (selected CPU blocks / available CPU
# blocks) alongside the existing compute density in DensityObserver.
#
# nanovllm/kvcache/sparse/xattn_bsa.py
#   After the per-chunk stats counters are updated, calls
#   DensityObserver.record_comm_density(layer_id=..., selected_cpu_blocks=
#   len(selected_block_ids), total_cpu_blocks=len(available_blocks)).
#   NOTE(review): the following context line divides by len(available_blocks);
#   presumably an earlier guard guarantees it is non-empty, and
#   record_comm_density receives the same value -- confirm.
#
# nanovllm/utils/density_observer.py
#   * Adds classmethod get_per_layer_comm_density(): the mean of each layer's
#     entries in cls._layer_comm_densities; layers with an empty list are
#     omitted from the result.
#   * get_summary(): renames "overall_density" -> "overall_compute_density" and
#     "per_layer_density" -> "per_layer_compute_density", and adds
#     "overall_comm_density" and "per_layer_comm_density".
#     NOTE(review): the old keys are removed -- confirm no caller still reads them.
#   * Summary printing: when overall_comm > 0, the comm-density line gains a
#     "(CPU block granularity)" suffix and a "Savings ratio" line showing
#     1 - overall_comm as a percentage is printed.
#
# tests/test_ruler.py
#   Calls DensityObserver.set_mode("offload" | "gpu_only") according to
#   enable_cpu_offload, and includes the mode in the "Enabled" message.
#   NOTE(review): the same conditional expression appears twice; it could be
#   computed once before set_mode().
#
# NOTE(review): this patch was recovered from a whitespace-collapsed copy.
# Line breaks follow the hunk headers' line counts, but indentation depth,
# blank-line positions and runs of spaces inside string literals are
# reconstructed -- verify against the target files before applying.
# The docstring added in get_per_layer_comm_density was translated to English,
# so the post-image hash on that file's "index" line is no longer exact.
diff --git a/nanovllm/kvcache/sparse/xattn_bsa.py b/nanovllm/kvcache/sparse/xattn_bsa.py
index f94d27e..4cb1c90 100644
--- a/nanovllm/kvcache/sparse/xattn_bsa.py
+++ b/nanovllm/kvcache/sparse/xattn_bsa.py
@@ -905,6 +905,15 @@ class XAttentionBSAPolicy(SparsePolicy):
         self._stats_total_selected_blocks += len(selected_block_ids)
         self._stats_num_chunks += 1
 
+        # Record communication density to DensityObserver
+        # Comm density = selected_cpu_blocks / available_cpu_blocks
+        # This is different from compute density (BSA block granularity)
+        DensityObserver.record_comm_density(
+            layer_id=layer_id,
+            selected_cpu_blocks=len(selected_block_ids),
+            total_cpu_blocks=len(available_blocks),
+        )
+
         # Log per-chunk density
         chunk_density = len(selected_block_ids) / len(available_blocks)
         logger.debug(f"[XAttn] chunk={ctx.query_chunk_idx}, available={len(available_blocks)}, "
diff --git a/nanovllm/utils/density_observer.py b/nanovllm/utils/density_observer.py
index 4bc14f0..fb0c14a 100644
--- a/nanovllm/utils/density_observer.py
+++ b/nanovllm/utils/density_observer.py
@@ -266,14 +266,31 @@ class DensityObserver(Observer):
             return 0.0
         return sum(all_densities) / len(all_densities)
 
+    @classmethod
+    def get_per_layer_comm_density(cls) -> Dict[int, float]:
+        """
+        Get the per-layer communication density (CPU block granularity).
+
+        Returns:
+            Dict[layer_id, avg_comm_density]
+        """
+        result = {}
+        for layer_id, densities in cls._layer_comm_densities.items():
+            if densities:
+                result[layer_id] = sum(densities) / len(densities)
+        return result
+
     @classmethod
     def get_summary(cls) -> dict:
         """返回统计摘要"""
         per_layer = cls.get_per_layer_density()
+        per_layer_comm = cls.get_per_layer_comm_density()
         return {
             "mode": cls._mode,
-            "overall_density": cls.get_overall_density(),
-            "per_layer_density": per_layer,
+            "overall_compute_density": cls.get_overall_density(),
+            "overall_comm_density": cls.get_overall_comm_density(),
+            "per_layer_compute_density": per_layer,
+            "per_layer_comm_density": per_layer_comm,
             "num_layers": len(per_layer),
             "last_mask_shape": {
                 "q_blocks": cls._last_q_blocks,
@@ -301,7 +318,9 @@ class DensityObserver(Observer):
         print(f"[DensityObserver] Mode: {cls._mode}")
         print(f" Compute density: {overall:.4f} (min: {min_density:.4f} @ layer {min_layer})")
         if overall_comm > 0:
-            print(f" Comm density: {overall_comm:.4f}")
+            # Offload mode: show both densities with explanation
+            print(f" Comm density: {overall_comm:.4f} (CPU block granularity)")
+            print(f" Savings ratio: {1 - overall_comm:.1%} H2D transfer reduction")
         print(f" Num layers: {len(per_layer)}")
         # 输出 layer 0 的 density 用于对比
         if 0 in per_layer:
diff --git a/tests/test_ruler.py b/tests/test_ruler.py
index df95ff8..fad110f 100644
--- a/tests/test_ruler.py
+++ b/tests/test_ruler.py
@@ -386,8 +386,11 @@ def run_ruler_benchmark(
     if sparse_policy and sparse_policy.upper() == "XATTN_BSA":
         DensityObserver.enable()
         DensityObserver.complete_reset()
+        # Set mode for correct density interpretation
+        DensityObserver.set_mode("offload" if enable_cpu_offload else "gpu_only")
         if not json_output:
-            print("[DensityObserver] Enabled for XAttention BSA")
+            mode_str = "offload" if enable_cpu_offload else "gpu_only"
+            print(f"[DensityObserver] Enabled for XAttention BSA (mode: {mode_str})")
 
     # LLM initialization kwargs
     llm_kwargs = {