feat: add --enforce-eager option to bench.py

Allow disabling CUDA graphs for benchmarking comparison between
eager mode and graph mode execution.

Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
This commit is contained in:
Zijie Tian
2026-01-27 09:19:53 +08:00
parent 6da116de98
commit 9177b62d7f

View File

@@ -58,6 +58,8 @@ def main():
                        help="Enable sparse policy routing (FullAttentionPolicy by default)")
    parser.add_argument("--gpu-util", type=float, default=0.9,
                        help="GPU memory utilization (default: 0.9)")
    parser.add_argument("--enforce-eager", action="store_true",
                        help="Disable CUDA graphs (default: False)")
    args = parser.parse_args()
    path = os.path.expanduser(args.model)
@@ -76,7 +78,7 @@ def main():
    llm = LLM(
        path,
        enforce_eager=args.enforce_eager,
        max_model_len=max_len,
        max_num_batched_tokens=max_len,
        sparse_policy=sparse_policy,