🔧 chore: add configurable arguments to bench_vllm.py
Add --model, --gpu-util, and --enforce-eager arguments, replacing the previously hard-coded model path, GPU memory utilization, and eager-mode setting, for flexible vLLM benchmarking comparisons.

Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
@@ -44,24 +44,28 @@ def bench_prefill(llm, num_seqs, input_len):
|
||||
def main():
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Benchmark vLLM performance (for comparison)")
|
||||
parser.add_argument("--model", type=str, default="~/models/Llama-3.1-8B-Instruct",
|
||||
help="Model path (default: ~/models/Llama-3.1-8B-Instruct)")
|
||||
parser.add_argument("--input-len", type=int, default=None, help="Input length in tokens")
|
||||
parser.add_argument("--output-len", type=int, default=64, help="Output length for decode benchmark (default: 64)")
|
||||
parser.add_argument("--max-len", type=int, default=32*1024, help="Max model length (default: 32K)")
|
||||
parser.add_argument("--gpu-util", type=float, default=0.9, help="GPU memory utilization (default: 0.9)")
|
||||
parser.add_argument("--enforce-eager", action="store_true", help="Disable CUDA Graphs (use eager mode)")
|
||||
parser.add_argument("--bench-decode", action="store_true", help="Run decode benchmark (default: prefill only)")
|
||||
parser.add_argument("--bench-all", action="store_true", help="Run both prefill and decode benchmarks")
|
||||
args = parser.parse_args()
|
||||
|
||||
path = os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/")
|
||||
path = os.path.expanduser(args.model)
|
||||
max_len = args.max_len
|
||||
|
||||
print(f"\n[vLLM] max_len={max_len}")
|
||||
print(f"\n[vLLM] max_len={max_len}, gpu_util={args.gpu_util}, enforce_eager={args.enforce_eager}")
|
||||
|
||||
llm = LLM(
|
||||
path,
|
||||
enforce_eager=False,
|
||||
enforce_eager=args.enforce_eager,
|
||||
max_model_len=max_len,
|
||||
max_num_seqs=128,
|
||||
gpu_memory_utilization=0.9,
|
||||
gpu_memory_utilization=args.gpu_util,
|
||||
)
|
||||
|
||||
# Warmup
|
||||
|
||||
Reference in New Issue
Block a user