[claudesquad] update from 'int-minference-1' on 08 Jan 26 23:22 CST

2026-01-08 23:22:38 +08:00
parent 0bfe1984ef
commit ea4e904de0
11 changed files with 853 additions and 533 deletions
--- a/tests/test_needle.py
+++ b/tests/test_needle.py
@@ -106,12 +106,15 @@ def run_needle_test(
    }
    if enable_cpu_offload:
        llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
-        llm_kwargs["sparse_policy"] = sparse_policy
        llm_kwargs["sparse_topk_blocks"] = sparse_topk
        llm_kwargs["sparse_threshold_blocks"] = sparse_threshold
-    elif enable_minference:
-        # MInference is GPU-only sparse prefill
+
+    # Set sparse policy (can be used with or without offload)
+    if enable_minference or enable_quest:
        llm_kwargs["sparse_policy"] = sparse_policy
+
+    # MInference params (apply in both GPU-only and offload modes)
+    if enable_minference:
        llm_kwargs["minference_adaptive_budget"] = minference_budget
        llm_kwargs["minference_vertical_size"] = minference_vertical
        llm_kwargs["minference_slash_size"] = minference_slash