diff --git a/bench.py b/bench.py
index 06c4e8c..05b8b47 100644
--- a/bench.py
+++ b/bench.py
@@ -5,7 +5,7 @@ from nanovllm import LLM, SamplingParams
 
 
 def bench_decode(llm, num_seqs, input_len, output_len):
-    """Benchmark decode performance (original test)"""
+    """Benchmark decode performance"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
     sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
@@ -13,9 +13,14 @@ def bench_decode(llm, num_seqs, input_len, output_len):
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = num_seqs * output_len
-    throughput = total_output_tokens / t
-    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+
+    # Calculate metrics
+    prefill_tokens = num_seqs * input_len
+    decode_tokens = num_seqs * output_len
+    decode_throughput = decode_tokens / t
+
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {decode_tokens}tok, Time: {t:.2f}s")
+    print(f"  Throughput: {decode_throughput:.2f} tok/s (includes prefill overhead)")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -35,32 +40,49 @@
 
 def main():
     import argparse
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Benchmark nanovllm GPU performance")
     parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens")
-    parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
+    parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)")
+    parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)")
+    parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
+    parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
     args = parser.parse_args()
 
     path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    # Note: Qwen3-4B-Instruct-2507 max_position_embeddings = 262144
-    max_len = 131072  # 128K tokens
-    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_batched_tokens=max_len)
+    max_len = args.max_len
+
+    print(f"\n[nanovllm GPU] max_len={max_len:,}")
+
+    llm = LLM(
+        path,
+        enforce_eager=False,
+        max_model_len=max_len,
+        max_num_batched_tokens=max_len,
+    )
 
     # Warmup
-    llm.generate(["Benchmark: "], SamplingParams())
+    print("\nWarming up...")
+    llm.generate(["Benchmark warmup: "], SamplingParams(max_tokens=10))
 
-    # Default input lengths based on max_len
+    # Default input lengths
     prefill_input_len = args.input_len if args.input_len else max_len - 1
     decode_input_len = args.input_len if args.input_len else max_len - args.output_len
 
-    print("=" * 60)
-    print("Prefill Benchmark (GPU)")
-    print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+    # Determine which benchmarks to run
+    run_prefill = not args.bench_decode or args.bench_all
+    run_decode = args.bench_decode or args.bench_all
 
-    # print("=" * 60)
-    # print("Decode Benchmark (GPU)")
-    # print("=" * 60)
-    # bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
+    if run_prefill:
+        print("\n" + "=" * 60)
+        print("Prefill Benchmark (nanovllm GPU)")
+        print("=" * 60)
+        bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+
+    if run_decode:
+        print("\n" + "=" * 60)
+        print("Decode Benchmark (nanovllm GPU)")
+        print("=" * 60)
+        bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
 
 
 if __name__ == "__main__":
diff --git a/bench_offload.py b/bench_offload.py
index 34edafe..3a1bbaa 100644
--- a/bench_offload.py
+++ b/bench_offload.py
@@ -5,7 +5,7 @@ from nanovllm import LLM, SamplingParams
 
 
 def bench_decode(llm, num_seqs, input_len, output_len):
-    """Benchmark decode performance (original test)"""
+    """Benchmark decode performance"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
     sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
@@ -13,9 +13,17 @@ def bench_decode(llm, num_seqs, input_len, output_len):
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = num_seqs * output_len
-    throughput = total_output_tokens / t
-    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+
+    # Calculate metrics
+    prefill_tokens = num_seqs * input_len
+    decode_tokens = num_seqs * output_len
+
+    # Approximate: assume prefill takes ~input_len/prefill_speed, rest is decode
+    # For more accurate measurement, we'd need internal timing
+    decode_throughput = decode_tokens / t  # This includes prefill time, so it's a lower bound
+
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {decode_tokens}tok, Time: {t:.2f}s")
+    print(f"  Throughput: {decode_throughput:.2f} tok/s (includes prefill overhead)")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -35,26 +43,32 @@
 
 def main():
     import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--no-sparse", action="store_true", help="Disable sparse attention (baseline)")
-    parser.add_argument("--topk", type=int, default=8, help="Top-K blocks for Quest")
-    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens (default: max_len - 1 for prefill, max_len - output_len for decode)")
-    parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
+    from nanovllm.config import SparsePolicyType
+
+    parser = argparse.ArgumentParser(description="Benchmark CPU offload performance")
+    parser.add_argument("--enable-quest", action="store_true", help="Enable Quest sparse attention for decode")
+    parser.add_argument("--topk", type=int, default=16, help="Top-K blocks for Quest (default: 16)")
+    parser.add_argument("--threshold", type=int, default=4, help="Apply sparse only when blocks > threshold (default: 4)")
+    parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens")
+    parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)")
+    parser.add_argument("--num-gpu-blocks", type=int, default=6, help="Number of GPU blocks (default: 6)")
+    parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)")
+    parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
+    parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
     args = parser.parse_args()
 
     path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    # Note: Qwen3-4B-Instruct-2507 max_position_embeddings = 262144
-    max_len = 32 * 1024  # 128K tokens
+    max_len = args.max_len
 
     # Setup policy configuration
-    if not args.no_sparse:
-        prefill_policy = "full"  # Full attention for prefill
-        decode_policy = "quest"  # Quest Top-K for decode
-        print(f"\n[Quest Sparse Attention] prefill={prefill_policy}, decode={decode_policy}, topk={args.topk}")
+    if args.enable_quest:
+        sparse_policy = SparsePolicyType.QUEST
+        print(f"\n[Quest Sparse Attention] topk={args.topk}, threshold={args.threshold}")
     else:
-        prefill_policy = "full"  # Full attention for both phases
-        decode_policy = "full"
-        print("\n[Full Attention] No sparse policy (baseline)")
+        sparse_policy = SparsePolicyType.FULL
+        print("\n[Full Attention] baseline (no sparse)")
+
+    print(f"[Config] max_len={max_len:,}, num_gpu_blocks={args.num_gpu_blocks}")
 
     llm = LLM(
         path,
@@ -62,29 +76,35 @@ def main():
         max_model_len=max_len,
         max_num_batched_tokens=max_len,
         enable_cpu_offload=True,
-        num_gpu_blocks=6,  # Small GPU buffer for offload testing
-        prefill_policy=prefill_policy,
-        decode_policy=decode_policy,
+        num_gpu_blocks=args.num_gpu_blocks,
+        sparse_policy=sparse_policy,
         sparse_topk_blocks=args.topk,
-        sparse_threshold_blocks=4,
+        sparse_threshold_blocks=args.threshold,
     )
 
     # Warmup
-    llm.generate(["Benchmark: "], SamplingParams())
+    print("\nWarming up...")
+    llm.generate(["Benchmark warmup: "], SamplingParams(max_tokens=10))
 
-    # Default input lengths based on max_len
+    # Default input lengths
     prefill_input_len = args.input_len if args.input_len else max_len - 1
     decode_input_len = args.input_len if args.input_len else max_len - args.output_len
 
-    print("=" * 60)
-    print("Prefill Benchmark (CPU Offload)")
-    print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+    # Determine which benchmarks to run
+    run_prefill = not args.bench_decode or args.bench_all
+    run_decode = args.bench_decode or args.bench_all
 
-    # print("=" * 60)
-    # print("Decode Benchmark (CPU Offload)")
-    # print("=" * 60)
-    # bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
+    if run_prefill:
+        print("\n" + "=" * 60)
+        print("Prefill Benchmark (CPU Offload)")
+        print("=" * 60)
+        bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+
+    if run_decode:
+        print("\n" + "=" * 60)
+        print("Decode Benchmark (CPU Offload)")
+        print("=" * 60)
+        bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
 
 
 if __name__ == "__main__":
diff --git a/bench_vllm.py b/bench_vllm.py
index 980e574..6d1e269 100644
--- a/bench_vllm.py
+++ b/bench_vllm.py
@@ -6,7 +6,7 @@ from vllm import LLM, SamplingParams
 
 
 def bench_decode(llm, num_seqs, input_len, output_len):
-    """Benchmark decode performance (original test)"""
+    """Benchmark decode performance"""
     seed(0)
     prompt_token_ids = [[randint(0, 10000) for _ in range(input_len)] for _ in range(num_seqs)]
     sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=output_len)
@@ -15,9 +15,14 @@ def bench_decode(llm, num_seqs, input_len, output_len):
     t = time.time()
     llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
     t = time.time() - t
-    total_output_tokens = num_seqs * output_len
-    throughput = total_output_tokens / t
-    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {total_output_tokens}tok, Time: {t:.2f}s, Throughput: {throughput:.2f}tok/s")
+
+    # Calculate metrics
+    prefill_tokens = num_seqs * input_len
+    decode_tokens = num_seqs * output_len
+    decode_throughput = decode_tokens / t
+
+    print(f"[Decode] Input: {num_seqs}x{input_len}tok, Output: {decode_tokens}tok, Time: {t:.2f}s")
+    print(f"  Throughput: {decode_throughput:.2f} tok/s (includes prefill overhead)")
 
 
 def bench_prefill(llm, num_seqs, input_len):
@@ -38,32 +43,50 @@
 
 def main():
     import argparse
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Benchmark vLLM performance (for comparison)")
     parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens")
-    parser.add_argument("--output-len", type=int, default=128, help="Output length in tokens")
+    parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)")
+    parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)")
+    parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
+    parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
    args = parser.parse_args()
 
     path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
-    # Note: Qwen3-4B-Instruct-2507 max_position_embeddings = 262144
-    max_len = 131072  # 128K tokens
-    llm = LLM(path, enforce_eager=False, max_model_len=max_len, max_num_seqs=128, gpu_memory_utilization=0.9)
+    max_len = args.max_len
+
+    print(f"\n[vLLM] max_len={max_len:,}")
+
+    llm = LLM(
+        path,
+        enforce_eager=False,
+        max_model_len=max_len,
+        max_num_seqs=128,
+        gpu_memory_utilization=0.9,
+    )
 
     # Warmup
-    llm.generate([dict(prompt_token_ids=[0])], SamplingParams())
+    print("\nWarming up...")
+    llm.generate([dict(prompt_token_ids=[0, 1, 2])], SamplingParams(max_tokens=10))
 
-    # Default input lengths based on max_len
+    # Default input lengths
     prefill_input_len = args.input_len if args.input_len else max_len - 1
     decode_input_len = args.input_len if args.input_len else max_len - args.output_len
 
-    print("=" * 60)
-    print("Prefill Benchmark (vLLM)")
-    print("=" * 60)
-    bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+    # Determine which benchmarks to run
+    run_prefill = not args.bench_decode or args.bench_all
+    run_decode = args.bench_decode or args.bench_all
 
-    # print("=" * 60)
-    # print("Decode Benchmark (vLLM)")
-    # print("=" * 60)
-    # bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
+    if run_prefill:
+        print("\n" + "=" * 60)
+        print("Prefill Benchmark (vLLM)")
+        print("=" * 60)
+        bench_prefill(llm, num_seqs=1, input_len=prefill_input_len)
+
+    if run_decode:
+        print("\n" + "=" * 60)
+        print("Decode Benchmark (vLLM)")
+        print("=" * 60)
+        bench_decode(llm, num_seqs=1, input_len=decode_input_len, output_len=args.output_len)
 
 
 if __name__ == "__main__":