From 59473fa4325f0a6e17ac971440abc44d94abe858 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Tue, 27 Jan 2026 09:07:49 +0800 Subject: [PATCH] =?UTF-8?q?=F0=9F=94=A7=20chore:=20add=20configurable=20ar?= =?UTF-8?q?guments=20to=20bench=5Fvllm.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --model, --gpu-util, and --enforce-eager arguments for flexible vLLM benchmarking comparisons. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- bench_vllm.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bench_vllm.py b/bench_vllm.py index 6d1e269..483b311 100644 --- a/bench_vllm.py +++ b/bench_vllm.py @@ -44,24 +44,28 @@ def bench_prefill(llm, num_seqs, input_len): def main(): import argparse parser = argparse.ArgumentParser(description="Benchmark vLLM performance (for comparison)") + parser.add_argument("--model", type=str, default="~/models/Llama-3.1-8B-Instruct", + help="Model path (default: ~/models/Llama-3.1-8B-Instruct)") parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens") parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)") parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)") + parser.add_argument("--gpu-util", type=float, default=0.9, help="GPU memory utilization (default: 0.9)") + parser.add_argument("--enforce-eager", action="store_true", help="Disable CUDA Graphs (use eager mode)") parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)") parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks") args = parser.parse_args() - path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/") + path = os.path.expanduser(args.model) max_len = args.max_len - print(f"\n[vLLM] max_len={max_len}") + print(f"\n[vLLM] max_len={max_len}, gpu_util={args.gpu_util}, enforce_eager={args.enforce_eager}") llm = LLM( path, - enforce_eager=False, + enforce_eager=args.enforce_eager, max_model_len=max_len, max_num_seqs=128, - gpu_memory_utilization=0.9, + gpu_memory_utilization=args.gpu_util, ) # Warmup