[claudesquad] update from 'layer-prefill-1' on 08 Jan 26 03:36 CST

2026-01-08 03:36:39 +08:00
parent 6575099a06
commit d8a87da1c3
10 changed files with 822 additions and 32 deletions
--- a/tests/test_needle.py
+++ b/tests/test_needle.py
@@ -31,8 +31,13 @@ def run_needle_test(
    max_new_tokens: int = 32,
    enable_cpu_offload: bool = False,
    enable_quest: bool = False,
+    enable_minference: bool = False,
    sparse_topk: int = 8,
    sparse_threshold: int = 4,
+    minference_budget: float = 0.3,
+    minference_vertical: int = 1000,
+    minference_slash: int = 6096,
+    gpu_utilization: float = 0.9,
    verbose: bool = True,
 ) -> bool:
    """
@@ -49,14 +54,25 @@ def run_needle_test(
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        enable_quest: Enable Quest sparse attention (decode-only Top-K)
+        enable_minference: Enable MInference sparse prefill (GPU-only)
        sparse_topk: Top-K blocks for Quest
        sparse_threshold: Apply sparse only when blocks > threshold
+        minference_budget: MInference adaptive budget (fraction of seq_len, None=fixed mode)
+        minference_vertical: Fixed vertical_size (only used when budget=None)
+        minference_slash: Fixed slash_size (only used when budget=None)
+        gpu_utilization: GPU memory utilization fraction
        verbose: Print detailed output

    Returns:
        True if test passed, False otherwise
    """
-    sparse_policy = SparsePolicyType.QUEST if enable_quest else SparsePolicyType.FULL
+    # Determine sparse policy
+    if enable_minference:
+        sparse_policy = SparsePolicyType.MINFERENCE
+    elif enable_quest:
+        sparse_policy = SparsePolicyType.QUEST
+    else:
+        sparse_policy = SparsePolicyType.FULL

    if verbose:
        print(f"\n{'='*60}")
@@ -69,8 +85,14 @@ def run_needle_test(
        print(f"Needle position: {needle_position:.0%}")
        print(f"Needle value: {needle_value}")
        print(f"CPU offload: {enable_cpu_offload}")
-        if enable_cpu_offload:
-            print(f"Sparse policy: {sparse_policy.name} (topk={sparse_topk}, threshold={sparse_threshold})")
+        print(f"Sparse policy: {sparse_policy.name}")
+        if enable_cpu_offload and enable_quest:
+            print(f"  Quest: topk={sparse_topk}, threshold={sparse_threshold}")
+        if enable_minference:
+            if minference_budget is not None:
+                print(f"  MInference: adaptive (budget={minference_budget})")
+            else:
+                print(f"  MInference: fixed (vertical={minference_vertical}, slash={minference_slash})")
        print(f"{'='*60}\n")

    # 1. Initialize LLM
@@ -80,12 +102,19 @@ def run_needle_test(
        "max_num_batched_tokens": max_model_len,
        "enable_cpu_offload": enable_cpu_offload,
        "kvcache_block_size": block_size,
+        "gpu_memory_utilization": gpu_utilization,
    }
    if enable_cpu_offload:
        llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
        llm_kwargs["sparse_policy"] = sparse_policy
        llm_kwargs["sparse_topk_blocks"] = sparse_topk
        llm_kwargs["sparse_threshold_blocks"] = sparse_threshold
+    elif enable_minference:
+        # MInference is GPU-only sparse prefill; its options are passed only when CPU offload is off
+        llm_kwargs["sparse_policy"] = sparse_policy
+        llm_kwargs["minference_adaptive_budget"] = minference_budget
+        llm_kwargs["minference_vertical_size"] = minference_vertical
+        llm_kwargs["minference_slash_size"] = minference_slash

    llm = LLM(model_path, **llm_kwargs)

@@ -186,6 +215,11 @@ if __name__ == "__main__":
        action="store_true",
        help="Enable Quest sparse attention (decode-only Top-K selection)"
    )
+    parser.add_argument(
+        "--enable-minference",
+        action="store_true",
+        help="Enable MInference sparse prefill (GPU-only, vertical+slash pattern)"
+    )
    parser.add_argument(
        "--sparse-topk",
        type=int,
@@ -198,8 +232,35 @@ if __name__ == "__main__":
        default=4,
        help="Apply sparse only when blocks > threshold"
    )
+    parser.add_argument(
+        "--minference-budget",
+        type=float,
+        default=0.3,
+        help="MInference adaptive budget (fraction of seq_len, 0.3=30%% compute, 0=fixed mode)"
+    )
+    parser.add_argument(
+        "--minference-vertical",
+        type=int,
+        default=1000,
+        help="Fixed vertical_size (only used when budget=0)"
+    )
+    parser.add_argument(
+        "--minference-slash",
+        type=int,
+        default=6096,
+        help="Fixed slash_size (only used when budget=0)"
+    )
+    parser.add_argument(
+        "--gpu-utilization",
+        type=float,
+        default=0.9,
+        help="GPU memory utilization (default: 0.9)"
+    )
    args = parser.parse_args()

+    # Map any non-positive budget (e.g. 0) to None, which selects fixed mode
+    minference_budget = args.minference_budget if args.minference_budget > 0 else None
+
    passed = run_needle_test(
        model_path=args.model,
        max_model_len=args.max_model_len,
@@ -211,8 +272,13 @@ if __name__ == "__main__":
        max_new_tokens=args.max_new_tokens,
        enable_cpu_offload=args.enable_offload,
        enable_quest=args.enable_quest,
+        enable_minference=args.enable_minference,
        sparse_topk=args.sparse_topk,
        sparse_threshold=args.sparse_threshold,
+        minference_budget=minference_budget,
+        minference_vertical=args.minference_vertical,
+        minference_slash=args.minference_slash,
+        gpu_utilization=args.gpu_utilization,
        verbose=True,
    )