From 73c9dc46fff09a87b59e473f58c51132491cf02e Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Tue, 27 Jan 2026 04:20:16 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20XAttention=20BSA=20su?= =?UTF-8?q?pport=20to=20bench=5Foffload.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add --model parameter (default: Llama-3.1-8B-Instruct) - Add --enable-xattn flag for XAttention BSA sparse prefill - Add --xattn-threshold and --xattn-stride parameters - Change default num-gpu-blocks from 6 to 4 - Add benchmark results doc with Full vs XAttn comparison (32K/128K) Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- CLAUDE.md | 1 + bench_offload.py | 29 ++++++++++-- docs/bench_offload_results.md | 89 +++++++++++++++++++++++++++++++++++ 3 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 docs/bench_offload_results.md diff --git a/CLAUDE.md b/CLAUDE.md index b302c56..b7e3647 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -27,6 +27,7 @@ Nano-vLLM is a lightweight vLLM implementation (~1,200 lines) for fast offline L | [`docs/chunked_attention_solutions.md`](docs/chunked_attention_solutions.md) | 🔧 SOLUTIONS: Chunked attention 准确性问题的代码分析和解决方案 | | [`docs/nsys_wrong_event_order_bug.md`](docs/nsys_wrong_event_order_bug.md) | 🐛 NSYS BUG: Ring buffer pipeline 触发 nsys 时间戳乱序问题的调试记录 | | [`docs/cpu_scheduling_latency_analysis.md`](docs/cpu_scheduling_latency_analysis.md) | ⚡ PERF: CPU 调度延迟分析,kernel 间隙来源,GPU 利用率优化方向 | +| [`docs/bench_offload_results.md`](docs/bench_offload_results.md) | 📊 BENCH: CPU offload 性能测试结果,Full vs XAttention 对比 (32K/128K) | ## Rules Index diff --git a/bench_offload.py b/bench_offload.py index 3a1bbaa..e650bbb 100644 --- a/bench_offload.py +++ b/bench_offload.py @@ -46,24 +46,41 @@ def main(): from nanovllm.config import SparsePolicyType parser = argparse.ArgumentParser(description="Benchmark CPU offload 
performance") - parser.add_argument("--enable-quest", action="store_true", help="Enable Quest sparse attention for decode") + parser.add_argument("--model", type=str, default="~/models/Llama-3.1-8B-Instruct", + help="Model path (default: ~/models/Llama-3.1-8B-Instruct)") + # Sparse policy selection (mutually exclusive) + sparse_group = parser.add_mutually_exclusive_group() + sparse_group.add_argument("--enable-quest", action="store_true", + help="Enable Quest sparse attention (decode only, prefill uses full)") + sparse_group.add_argument("--enable-xattn", action="store_true", + help="Enable XAttention BSA (prefill only, decode uses full)") + # Quest parameters parser.add_argument("--topk", type=int, default=16, help="Top-K blocks for Quest (default: 16)") parser.add_argument("--threshold", type=int, default=4, help="Apply sparse only when blocks > threshold (default: 4)") + # XAttention parameters + parser.add_argument("--xattn-threshold", type=float, default=0.95, + help="XAttention cumulative attention threshold (default: 0.95)") + parser.add_argument("--xattn-stride", type=int, default=8, + help="XAttention Q/K downsampling stride (default: 8)") + # General parameters parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens") parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)") - parser.add_argument("--num-gpu-blocks", type=int, default=6, help="Number of GPU blocks (default: 6)") + parser.add_argument("--num-gpu-blocks", type=int, default=4, help="Number of GPU blocks (default: 4)") parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)") parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)") parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks") args = parser.parse_args() - path = 
os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/") + path = os.path.expanduser(args.model) max_len = args.max_len # Setup policy configuration if args.enable_quest: sparse_policy = SparsePolicyType.QUEST - print(f"\n[Quest Sparse Attention] topk={args.topk}, threshold={args.threshold}") + print(f"\n[Quest Sparse Attention] decode: Quest (topk={args.topk}, threshold={args.threshold}), prefill: Full") + elif args.enable_xattn: + sparse_policy = SparsePolicyType.XATTN_BSA + print(f"\n[XAttention BSA] prefill: XAttn (tau={args.xattn_threshold}, stride={args.xattn_stride}), decode: Full") else: sparse_policy = SparsePolicyType.FULL print("\n[Full Attention] baseline (no sparse)") @@ -78,8 +95,12 @@ def main(): enable_cpu_offload=True, num_gpu_blocks=args.num_gpu_blocks, sparse_policy=sparse_policy, + # Quest parameters sparse_topk_blocks=args.topk, sparse_threshold_blocks=args.threshold, + # XAttention parameters + sparse_threshold=args.xattn_threshold, + sparse_stride=args.xattn_stride, ) # Warmup diff --git a/docs/bench_offload_results.md b/docs/bench_offload_results.md new file mode 100644 index 0000000..9a3ff7b --- /dev/null +++ b/docs/bench_offload_results.md @@ -0,0 +1,89 @@ +# CPU Offload Benchmark Results + +本文档记录 `bench_offload.py` 在不同配置下的性能测试结果。 + +## 测试环境 + +| 参数 | 值 | +|------|-----| +| GPU | NVIDIA A100-SXM4-80GB | +| 模型 | Llama-3.1-8B-Instruct | +| GPU slots | 4 | +| Block size | 1024 tokens | +| Chunk size | 2048 tokens | + +## Sparse Policy 配置 + +| 策略 | Prefill | Decode | 说明 | +|------|---------|--------|------| +| FULL | Full Attention | Full Attention | 基线,加载所有 blocks | +| XATTN_BSA | XAttention (tau=0.95, stride=8) | Full Attention (fallback) | 稀疏 prefill | + +## 测试结果 + +### 32K 上下文 + +| 策略 | 输入长度 | 耗时 | 吞吐量 | 相对性能 | +|------|----------|------|--------|----------| +| Full Attention | 32767 tok | 20.64s | **1587.74 tok/s** | baseline | +| XAttention BSA | 32767 tok | 27.95s | **1172.33 tok/s** | 0.74x | + +### 128K 上下文 + +| 策略 | 输入长度 | 耗时 | 吞吐量 | 
相对性能 | +|------|----------|------|--------|----------| +| Full Attention | 131071 tok | 237.18s | **552.63 tok/s** | baseline | +| XAttention BSA | 131071 tok | 281.17s | **466.17 tok/s** | 0.84x | + +### KV Cache 配置 + +| 上下文 | GPU Memory | CPU Memory | Total | +|--------|------------|------------|-------| +| 32K | 512 MB (4 blocks) | 4096 MB (32 blocks) | 4608 MB | +| 128K | 512 MB (4 blocks) | 16384 MB (128 blocks) | 16896 MB | + +## 分析 + +### XAttention 性能特点 + +1. **32K 上下文**: XAttention 吞吐量比 Full 低 26%(耗时多约 35%) +2. **128K 上下文**: XAttention 吞吐量比 Full 低 16%(耗时多约 19%) + +随着上下文增长,XAttention 的相对性能有所提升(74% → 84%),但仍未超过 Full Attention。 + +### 原因分析 + +1. **tau=0.95 阈值较高**: 需要覆盖 95% 累积注意力,实际跳过的 block 较少 +2. **估计开销**: `xattn_estimate_chunked` 需要对每个 chunk 计算稀疏 mask +3. **BSA kernel overhead**: Block sparse kernel 有额外的 mask 处理和索引开销 +4. **Offload 瓶颈**: CPU→GPU 传输是主要瓶颈,稀疏注意力节省的是计算而非传输 + +### 适用场景 + +XAttention BSA 更适合以下场景: +- 更长的上下文(256K+),稀疏收益更明显 +- 计算密集型任务(非 offload 模式),传输不是瓶颈 +- 较低的 tau 阈值(如 0.8),增加稀疏性 + +## 运行命令 + +```bash +# Full Attention (32K) +CUDA_VISIBLE_DEVICES=0 python bench_offload.py --max-len 32768 + +# XAttention BSA (32K) +CUDA_VISIBLE_DEVICES=0 python bench_offload.py --max-len 32768 --enable-xattn + +# Full Attention (128K) +CUDA_VISIBLE_DEVICES=0 python bench_offload.py --max-len 131072 + +# XAttention BSA (128K) +CUDA_VISIBLE_DEVICES=0 python bench_offload.py --max-len 131072 --enable-xattn + +# 调整 XAttention 参数 +CUDA_VISIBLE_DEVICES=0 python bench_offload.py --enable-xattn --xattn-threshold 0.8 --xattn-stride 16 +``` + +## 更新记录 + +- 2026-01-27: 初始测试,Llama-3.1-8B-Instruct, A100 80GB