From 9177b62d7fabd2fba32cb187207a361c55614ad2 Mon Sep 17 00:00:00 2001
From: Zijie Tian <zijietian@mail.xmu.edu.cn>
Date: Tue, 27 Jan 2026 09:19:53 +0800
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20feat:=20add=20--enforce-eager=20opt?=
 =?UTF-8?q?ion=20to=20bench.py?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

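Add an --enforce-eager flag to bench.py that disables CUDA graphs, so
eager-mode and graph-mode execution can be benchmarked against each
other. The flag defaults to off, which preserves the previous
hard-coded enforce_eager=False behavior.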

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
---
 bench.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bench.py b/bench.py
index 8717ef1..d1dac2e 100644
--- a/bench.py
+++ b/bench.py
@@ -58,6 +58,8 @@ def main():
                         help="Enable sparse policy routing (FullAttentionPolicy by default)")
     parser.add_argument("--gpu-util", type=float, default=0.9,
                         help="GPU memory utilization (default: 0.9)")
+    parser.add_argument("--enforce-eager", action="store_true",
+                        help="Disable CUDA graphs (default: False)")
     args = parser.parse_args()
 
     path = os.path.expanduser(args.model)
@@ -76,7 +78,7 @@ def main():
 
     llm = LLM(
         path,
-        enforce_eager=False,
+        enforce_eager=args.enforce_eager,
         max_model_len=max_len,
         max_num_batched_tokens=max_len,
         sparse_policy=sparse_policy,