[claudesquad] update from 'int-minference-1' on 08 Jan 26 23:22 CST

This commit is contained in:
Zijie Tian
2026-01-08 23:22:38 +08:00
parent 0bfe1984ef
commit ea4e904de0
11 changed files with 853 additions and 533 deletions

View File

@@ -106,12 +106,15 @@ def run_needle_test(
}
if enable_cpu_offload:
llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
llm_kwargs["sparse_policy"] = sparse_policy
llm_kwargs["sparse_topk_blocks"] = sparse_topk
llm_kwargs["sparse_threshold_blocks"] = sparse_threshold
elif enable_minference:
# MInference is GPU-only sparse prefill
# Set sparse policy (can be used with or without offload)
if enable_minference or enable_quest:
llm_kwargs["sparse_policy"] = sparse_policy
# MInference params (works with both GPU-only and offload mode)
if enable_minference:
llm_kwargs["minference_adaptive_budget"] = minference_budget
llm_kwargs["minference_vertical_size"] = minference_vertical
llm_kwargs["minference_slash_size"] = minference_slash