From 9177b62d7fabd2fba32cb187207a361c55614ad2 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Tue, 27 Jan 2026 09:19:53 +0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20--enforce-eager=20opt?= =?UTF-8?q?ion=20to=20bench.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow disabling CUDA graphs for benchmarking comparison between eager mode and graph mode execution. Generated with [Claude Code](https://claude.ai/code) via [Happy](https://happy.engineering) Co-Authored-By: Claude Co-Authored-By: Happy --- bench.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bench.py b/bench.py index 8717ef1..d1dac2e 100644 --- a/bench.py +++ b/bench.py @@ -58,6 +58,8 @@ def main(): help="Enable sparse policy routing (FullAttentionPolicy by default)") parser.add_argument("--gpu-util", type=float, default=0.9, help="GPU memory utilization (default: 0.9)") + parser.add_argument("--enforce-eager", action="store_true", + help="Disable CUDA graphs (default: False)") args = parser.parse_args() path = os.path.expanduser(args.model) @@ -76,7 +78,7 @@ def main(): llm = LLM( path, - enforce_eager=False, + enforce_eager=args.enforce_eager, max_model_len=max_len, max_num_batched_tokens=max_len, sparse_policy=sparse_policy,