[tests] Added test_niah_standalone.py.

2026-01-12 00:16:37 +08:00
parent 5895de0c97
commit a6cc703d73
6 changed files with 686 additions and 9 deletions
--- a/docs/ruler_niah_standalone_test.md
+++ b/docs/ruler_niah_standalone_test.md
@@ -0,0 +1,297 @@
 # RULER NIAH Standalone Test Plan
 ## Overview
 This document describes how to independently test nano-vllm's CPU offload functionality using RULER benchmark's NIAH (Needle-In-A-Haystack) task data.
 ## Background
 ### Problem Being Investigated
 When running 32K sequence length tests with CPU offload mode, the model outputs garbled text instead of finding the magic number. This issue was traced to:
 - **Root Cause**: Ring buffer `max_seq_len` was set equal to `max_model_len` (32768)
 - **Issue**: When prefill uses ~32K tokens, decode needs to store KV at position 32768+, but ring buffer only has indices 0-32767
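 - **Fix Applied**: In `nanovllm/kvcache/__init__.py`, `max_seq_len` is now set to `max_model_len + max_new_tokens` (where `max_new_tokens` defaults to 512, i.e. `max_model_len + 512`) instead of `max_model_len`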
 ### Test Objective
 Verify that the fix works correctly by running a standalone test with actual RULER NIAH data.
 ## Step 1: Copy Test Data
 ### Source Location
 ```
 /home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl
 ```
 ### Data Format
 Each line is a JSON object:
 ```json
 {
  "index": 0,
  "input": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA special magic number is hidden within the following text...",
  "outputs": ["8930103"],
  "length": 32768
 }
 ```
 - `input`: Full prompt with Llama 3.1 chat template (~122K characters, ~30K tokens)
 - `outputs`: Expected answer (the magic number to find)
 - `length`: Target sequence length in tokens
 ### Copy Command
 ```bash
 mkdir -p /home/zijie/Code/nano-vllm/tests/data/ruler_niah
 cp /home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl \
   /home/zijie/Code/nano-vllm/tests/data/ruler_niah/niah_single_1_32k.jsonl
 ```
 ## Step 2: Create Test Script
 Create `/home/zijie/Code/nano-vllm/tests/test_ruler_niah_32k.py`:
 ```python
 """
 Standalone test for RULER NIAH task with 32K context length.
 This test verifies that CPU offload mode correctly handles long sequences
 where prefill tokens approach max_model_len.
 Usage:
    python tests/test_ruler_niah_32k.py
 """
 import json
 import torch
 from pathlib import Path
 from nanovllm import LLM
 from nanovllm.config import SamplingParams
 # Configuration
 MODEL_PATH = "/data/models/Llama-3.1-8B-Instruct"
 DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
 MAX_MODEL_LEN = 32768
 MAX_NEW_TOKENS = 50
 # CPU Offload Settings
 ENABLE_CPU_OFFLOAD = True
 NUM_GPU_BLOCKS = 4
 BLOCK_SIZE = 1024
 def load_test_sample(filepath: Path, index: int = 0) -> dict:
    """Return the JSON object stored on line ``index`` (0-based) of a JSONL file.
    Raises ValueError when the file has no line at that position.
    """
    with open(filepath) as handle:
        # Pull the first line whose position matches; None means "ran off the end".
        wanted = next((row for pos, row in enumerate(handle) if pos == index), None)
    if wanted is None:
        raise ValueError(f"Sample index {index} not found")
    return json.loads(wanted)
 def test_niah_single():
    """Run the first NIAH sample with CPU offload and check the retrieved number.
    Returns:
        True if the expected magic number occurs in the generated text,
        otherwise False.
    """
    print("=" * 60)
    print("RULER NIAH 32K Standalone Test")
    print("=" * 60)
    # Load test data
    sample = load_test_sample(DATA_FILE, index=0)
    prompt = sample["input"]
    expected = sample["outputs"][0]
    print(f"Prompt length: {len(prompt)} characters")
    print(f"Expected answer: {expected}")
    print()
    # Initialize model with CPU offload
    print("Initializing LLM with CPU offload...")
    llm = LLM(
        model=MODEL_PATH,
        max_model_len=MAX_MODEL_LEN,
        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
        num_gpu_blocks=NUM_GPU_BLOCKS,
        kvcache_block_size=BLOCK_SIZE,
        enforce_eager=True,  # Disable CUDA graphs for debugging
    )
    # Generate
    print("Generating response...")
    # nano-vllm does not support greedy decoding (temperature=0), so a low
    # temperature is used for near-deterministic output, as in
    # tests/test_ruler_niah.py.
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=MAX_NEW_TOKENS,
    )
    outputs = llm.generate([prompt], sampling_params)
    # Each result is a dict; the decoded completion is stored under "text"
    # (same access pattern as tests/test_ruler_niah.py).
    generated_text = outputs[0]["text"]
    print()
    print("=" * 60)
    print("Results")
    print("=" * 60)
    print(f"Expected: {expected}")
    print(f"Generated: {generated_text[:200]}...")
    print()
    # Check if expected number is in output
    if expected in generated_text:
        print("SUCCESS: Magic number found in output!")
        return True
    print("FAILED: Magic number NOT found in output")
    print(f"Full output: {generated_text}")
    return False
 def test_multiple_samples(num_samples: int = 5):
    """Run the first ``num_samples`` NIAH samples through one shared LLM instance.
    Args:
        num_samples: Number of samples (file lines 0..num_samples-1) to test.
    Returns:
        True if every sample's expected magic number was found in its output.
    """
    print("=" * 60)
    print(f"Testing {num_samples} NIAH samples with 32K context")
    print("=" * 60)
    # Initialize model once
    llm = LLM(
        model=MODEL_PATH,
        max_model_len=MAX_MODEL_LEN,
        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
        num_gpu_blocks=NUM_GPU_BLOCKS,
        kvcache_block_size=BLOCK_SIZE,
        enforce_eager=True,
    )
    # nano-vllm does not support greedy decoding (temperature=0), so a low
    # temperature is used for near-deterministic output, as in
    # tests/test_ruler_niah.py.
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=MAX_NEW_TOKENS,
    )
    correct = 0
    for i in range(num_samples):
        sample = load_test_sample(DATA_FILE, index=i)
        prompt = sample["input"]
        expected = sample["outputs"][0]
        outputs = llm.generate([prompt], sampling_params)
        # Each result is a dict; the decoded completion is stored under "text"
        # (same access pattern as tests/test_ruler_niah.py).
        generated_text = outputs[0]["text"]
        if expected in generated_text:
            print(f"Sample {i}: PASS (found {expected})")
            correct += 1
        else:
            print(f"Sample {i}: FAIL (expected {expected}, got: {generated_text[:50]}...)")
    print()
    print(f"Accuracy: {correct}/{num_samples} ({100*correct/num_samples:.1f}%)")
    return correct == num_samples
 if __name__ == "__main__":
    import sys
    # "--all" as the first CLI argument selects the multi-sample run.
    run_all = sys.argv[1:2] == ["--all"]
    passed = test_multiple_samples(5) if run_all else test_niah_single()
    # Report the outcome as the process exit status (0 = pass, 1 = fail).
    sys.exit(1 if not passed else 0)
 ```
 ## Step 3: Run Test
 ### Single Sample Test
 ```bash
 cd /home/zijie/Code/nano-vllm
 CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py
 ```
 ### All 5 Samples
 ```bash
 cd /home/zijie/Code/nano-vllm
 CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py --all
 ```
 ## Step 4: Expected Results
 ### Before Fix (Bug)
 - Output: Garbled text like "not only has been replaced by thesiums..."
 - Score: 0% (magic number not found)
 - Time: ~80 seconds per sample
 ### After Fix (Expected)
 - Output: The magic number (e.g., "8930103")
 - Score: ~100% (magic number found)
 - Time: ~80 seconds per sample (unchanged, since the fix does not alter the amount of compute)
 ## Debugging Tips
 ### Enable Verbose Logging
 ```python
 import logging
 logging.basicConfig(level=logging.DEBUG)
 ```
 ### Check Ring Buffer Size
 In the logs, verify:
 ```
 OffloadEngine initializing: num_layers=32, num_kv_buffers=4, max_seq_len=33280
 ```
 The `max_seq_len` should be `32768 + 512 = 33280` (not 32768).
 ### Monitor GPU Memory
 ```bash
 watch -n 1 nvidia-smi
 ```
 With CPU offload, GPU memory for KV cache should be ~640MB (ring buffer only).
 ## Related Files
 | File | Description |
 |------|-------------|
 | `nanovllm/kvcache/__init__.py` | Fix location: `max_seq_len = max_model_len + 512` |
 | `nanovllm/kvcache/offload_engine.py` | Ring buffer allocation |
 | `nanovllm/engine/model_runner.py` | Layer-wise offload prefill/decode |
 | `nanovllm/kvcache/hybrid_manager.py` | CPU block management |
 ## Test Data Details
 ### NIAH Task Description
 The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a specific piece of information (the "needle") from a large context (the "haystack").
 - **Needle**: A magic number associated with a keyword (e.g., "worried-purse")
 - **Haystack**: ~30K tokens of distractor text
 - **Task**: Extract the magic number when asked
 ### Sample Prompt Structure
 ```
 <|begin_of_text|><|start_header_id|>user<|end_header_id|>
 A special magic number is hidden within the following text. Make sure to memorize it. I will quiz you about the number afterwards.
 [... ~30K tokens of haystack text ...]
 The special magic number for worried-purse is 8930103.
 [... more haystack text ...]
 What is the special magic number for worried-purse mentioned in the provided text?
 <|eot_id|><|start_header_id|>assistant<|end_header_id|>
 The special magic number for worried-purse mentioned in the provided text is
 ```
 The model should complete with: `8930103`
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -61,6 +61,15 @@ class Config:
        self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
        assert self.max_num_batched_tokens >= self.max_model_len
        # CPU offload mode only supports single sequence (layer-wise processing)
        if self.enable_cpu_offload and self.max_num_seqs != 1:
            import logging
            logging.warning(
                f"CPU offload mode only supports single sequence. "
                f"Overriding max_num_seqs from {self.max_num_seqs} to 1."
            )
            self.max_num_seqs = 1
        # Override torch_dtype if user specified
        if self.dtype is not None:
            dtype_map = {
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -27,7 +27,9 @@ class ModelRunner:
        self.rank = rank
        self.event = event
-        dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
+        import os
        port = os.environ.get("NANOVLLM_DIST_PORT", "2333")
        dist.init_process_group("nccl", f"tcp://localhost:{port}", world_size=self.world_size, rank=rank)
        torch.cuda.set_device(rank)
        default_dtype = torch.get_default_dtype()
        torch.set_default_dtype(hf_config.torch_dtype)
@@ -546,8 +548,8 @@ class ModelRunner:
            k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
            v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
-            # Q/K norms (Qwen3 specific)
+            # Q/K norms (Qwen3 specific - only when qkv_bias=False)
-            if not layer.self_attn.qkv_bias:
+            if not getattr(layer.self_attn, 'qkv_bias', True):
                num_tokens = q.shape[0]
                q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
@@ -649,8 +651,8 @@ class ModelRunner:
            k_new = k_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
            v_new = v_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
-            # Q/K norms
+            # Q/K norms (Qwen3 specific - only when qkv_bias=False)
-            if not layer.self_attn.qkv_bias:
+            if not getattr(layer.self_attn, 'qkv_bias', True):
                q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                q = q.view(1, layer.self_attn.num_heads, layer.self_attn.head_dim)
                k_new = layer.self_attn.k_norm(k_new.reshape(-1, layer.self_attn.head_dim))
@@ -785,8 +787,8 @@ class ModelRunner:
                k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
                v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
-                # Q/K norms (Qwen3 specific)
+                # Q/K norms (Qwen3 specific - only when qkv_bias=False)
-                if not layer.self_attn.qkv_bias:
+                if not getattr(layer.self_attn, 'qkv_bias', True):
                    num_tokens = q.shape[0]
                    q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                    q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
--- a/nanovllm/kvcache/init.py
+++ b/nanovllm/kvcache/init.py
@@ -71,6 +71,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
        threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
    )
    # max_seq_len needs to be larger than max_model_len to accommodate decode tokens
    # When prefill uses ~max_model_len tokens, decode needs additional slots
    # Add max_new_tokens (default 512) buffer for decode phase
    max_new_tokens = getattr(config, 'max_new_tokens', 512)
    max_seq_len = config.max_model_len + max_new_tokens
    return HybridKVCacheManager(
        num_gpu_slots=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
@@ -78,7 +84,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
        policy=eviction_policy,
        sparse_policy=sparse_policy,
        num_kv_buffers=getattr(config, 'num_kv_buffers', 4),
-        max_seq_len=config.max_model_len,
+        max_seq_len=max_seq_len,
    )
--- a/nanovllm/models/init.py
+++ b/nanovllm/models/init.py
@@ -3,7 +3,13 @@
 from nanovllm.models.registry import register_model, get_model_class, MODEL_REGISTRY
 # Import models to trigger registration
 # Qwen3 requires transformers>=4.51.0 for Qwen3Config
 try:
    from nanovllm.models import qwen3
 except ImportError as e:
    import warnings
    warnings.warn(f"Qwen3 model not available (requires transformers>=4.51.0): {e}")
 from nanovllm.models import llama
 __all__ = ["register_model", "get_model_class", "MODEL_REGISTRY"]
--- a/tests/test_ruler_niah.py
+++ b/tests/test_ruler_niah.py
@@ -0,0 +1,357 @@
 """
 RULER NIAH benchmark test for LLM.
 Tests: Long context retrieval capability using pre-generated RULER benchmark data.
 The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
 specific magic number from a large context (~32K tokens).
 Usage:
    # Test all samples with CPU offload
    python tests/test_ruler_niah.py --enable-offload
    # Test specific samples
    python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload
    # Test with custom model
    python tests/test_ruler_niah.py --model /path/to/model --enable-offload
 """
 import os
 os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"
 import argparse
 import json
 from pathlib import Path
 from typing import List, Tuple, Optional
 from nanovllm import LLM, SamplingParams
 from utils import check_needle_answer
 # ============================================================
 # Constants
 # ============================================================
 # Default dataset: JSONL file resolved relative to this test file's directory.
 DEFAULT_DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
 # Default model directory, with "~" expanded to the current user's home.
 DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
 # Default value for max_model_len (also used as max_num_batched_tokens below).
 DEFAULT_MAX_MODEL_LEN = 32768
 # Default cap on the number of tokens generated per sample.
 DEFAULT_MAX_NEW_TOKENS = 50
 # ============================================================
 # Data Loading
 # ============================================================
 def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
    """
    Load RULER NIAH samples from a JSONL file.
    Args:
        filepath: Path to the JSONL file
        indices: Optional list of 0-based line indices to load. If None, load all.
    Returns:
        List of sample dicts, in file order, with keys: index, input, outputs, length
    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        ValueError: If no sample was selected (empty file or no matching index).
    """
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
        )
    # Set membership is O(1); a list would be rescanned for every line of the file.
    wanted = None if indices is None else set(indices)
    samples = []
    # Pin the encoding so parsing does not depend on the platform locale.
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if wanted is not None and i not in wanted:
                continue
            # Tolerate blank lines (e.g. an extra newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue
            samples.append(json.loads(line))
    if not samples:
        raise ValueError(f"No samples loaded from {filepath}")
    return samples
 def count_samples(filepath: Path) -> int:
    """Return the number of lines (one sample per line) in a JSONL file."""
    total = 0
    with open(filepath) as handle:
        for _ in handle:
            total += 1
    return total
 # ============================================================
 # Test Function
 # ============================================================
 def run_ruler_niah_test(
    model_path: str,
    data_file: Path,
    sample_indices: Optional[List[int]] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
    verbose: bool = True,
 ) -> Tuple[int, int]:
    """
    Evaluate an LLM on RULER NIAH samples and count the correct answers.
    Prompts are generated one at a time; each output is scored with
    ``check_needle_answer`` against the sample's first expected answer.
    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        sample_indices: List of sample indices to test (None = all)
        max_model_len: Maximum model context length
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        num_gpu_blocks: Number of GPU blocks (only passed on when offload is enabled)
        block_size: KV cache block size
        gpu_utilization: GPU memory utilization fraction
        enforce_eager: Disable CUDA graphs
        verbose: Print detailed output
    Returns:
        (correct, total): Number of correct and total samples
    """
    samples = load_ruler_samples(data_file, sample_indices)
    total = len(samples)
    rule = "=" * 60
    # Configuration banner
    if verbose:
        print(f"\n{rule}")
        print("RULER NIAH Test")
        print(rule)
        print(f"Model: {model_path}")
        print(f"Data file: {data_file}")
        print(f"Samples: {total}")
        print(f"Max model len: {max_model_len}")
        print(f"Max new tokens: {max_new_tokens}")
        print(f"CPU offload: {enable_cpu_offload}")
        if enable_cpu_offload:
            print(f"  num_gpu_blocks: {num_gpu_blocks}")
            print(f"  block_size: {block_size}")
        print(f"Enforce eager: {enforce_eager}")
        print(f"{rule}\n")
    # Warn (even in quiet mode) when the longest sample exceeds the context window
    longest = max(entry.get("length", 0) for entry in samples)
    if longest > max_model_len:
        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({longest})")
        print("         This may cause truncation or errors.\n")
    # Build the engine
    if verbose:
        print("Initializing LLM...")
    engine_options = dict(
        max_model_len=max_model_len,
        max_num_batched_tokens=max_model_len,
        enforce_eager=enforce_eager,
        gpu_memory_utilization=gpu_utilization,
        kvcache_block_size=block_size,
        enable_cpu_offload=enable_cpu_offload,
    )
    if enable_cpu_offload:
        engine_options["num_gpu_blocks"] = num_gpu_blocks
    llm = LLM(model_path, **engine_options)
    # Note: nano-vllm doesn't support greedy (temperature=0), use low temperature instead
    sampling_params = SamplingParams(temperature=0.1, max_tokens=max_new_tokens)
    # Run every sample and record its outcome
    results = []
    for position, sample in enumerate(samples):
        sample_idx = sample.get("index", position)
        prompt = sample["input"]
        expected = sample["outputs"][0]
        data_len = sample.get("length", "unknown")
        if verbose:
            print(f"\nSample {sample_idx}: Expected={expected}, Length={data_len}")
        generation = llm.generate([prompt], sampling_params, use_tqdm=False)[0]
        output_text = generation["text"]
        output_tokens = generation["token_ids"]
        passed = check_needle_answer(output_text, expected)
        results.append({
            "index": sample_idx,
            "expected": expected,
            "output": output_text,
            "passed": passed,
        })
        if verbose:
            status = "PASS" if passed else "FAIL"
            output_preview = output_text[:100].replace('\n', ' ')
            print(f"  Output ({len(output_tokens)} tokens): {output_preview}...")
            print(f"  Status: {status}")
    correct = sum(1 for record in results if record["passed"])
    # Summary, followed by the list of failing samples (if any)
    if verbose:
        print(f"\n{rule}")
        print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
        print(f"{rule}\n")
        failures = [record for record in results if not record["passed"]]
        if failures:
            print("Failed samples:")
            for record in failures:
                print(f"  Sample {record['index']}: expected={record['expected']}, got={record['output'][:50]}...")
    return correct, total
 # ============================================================
 # CLI Entry Point
 # ============================================================
 def parse_indices(s: str) -> Optional[List[int]]:
    """
    Parse a sample-index specification such as '0,1,2', '0-4' or '0,2-4'.
    Args:
        s: Comma-separated non-negative indices and/or inclusive 'start-end'
           ranges. Whitespace around items and empty items (e.g. a trailing
           comma) are ignored.
    Returns:
        The indices in the order given, or None if ``s`` is empty or blank
        (callers treat None as "all samples").
    Raises:
        ValueError: If an item is neither an integer nor a 'start-end' pair.
    """
    if not s or not s.strip():
        return None
    indices = []
    for part in s.split(','):
        part = part.strip()
        # Skip empty items so '0,1,' or '0,,1' do not fail in int('').
        if not part:
            continue
        if '-' in part:
            start, end = part.split('-')
            # Ranges are inclusive on both ends: '0-4' -> 0, 1, 2, 3, 4.
            indices.extend(range(int(start), int(end) + 1))
        else:
            indices.append(int(part))
    return indices
 if __name__ == "__main__":
    # Command-line entry point: parse options, run the benchmark, and report
    # the result through both a final status line and the process exit code.
    parser = argparse.ArgumentParser(
        description="RULER NIAH benchmark test for long context LLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 Examples:
  # Test all samples with CPU offload (recommended for 24GB GPUs)
  python tests/test_ruler_niah.py --enable-offload
  # Test specific samples
  python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload
  # Test with CUDA graph enabled
  python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
        """
    )
    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to model (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        default=str(DEFAULT_DATA_FILE),
        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
    )
    parser.add_argument(
        "--sample-indices",
        type=str,
        default="",
        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=DEFAULT_MAX_MODEL_LEN,
        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=DEFAULT_MAX_NEW_TOKENS,
        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
    )
    parser.add_argument(
        "--enable-offload",
        action="store_true",
        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=4,
        help="Number of GPU blocks for CPU offload (default: 4)"
    )
    parser.add_argument(
        "--block-size",
        type=int,
        default=1024,
        help="KV cache block size (default: 1024)"
    )
    parser.add_argument(
        "--gpu-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization fraction (default: 0.9)"
    )
    # NOTE: accepted for command-line compatibility only. The value is never
    # read; eager mode is decided solely by --use-cuda-graph below.
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        default=True,
        help="Force eager execution, disable CUDA graphs (default: True)"
    )
    parser.add_argument(
        "--use-cuda-graph",
        action="store_true",
        help="Enable CUDA graph (overrides --enforce-eager)"
    )
    # NOTE: accepted for command-line compatibility only. The value is never
    # read; verbosity is decided solely by --quiet below.
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=True,
        help="Print detailed output (default: True)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Quiet mode, only print final result"
    )
    args = parser.parse_args()
    # Process arguments
    sample_indices = parse_indices(args.sample_indices)
    enforce_eager = not args.use_cuda_graph
    verbose = not args.quiet
    # Run test
    correct, total = run_ruler_niah_test(
        model_path=os.path.expanduser(args.model),
        data_file=Path(args.data_file),
        sample_indices=sample_indices,
        max_model_len=args.max_model_len,
        max_new_tokens=args.max_new_tokens,
        enable_cpu_offload=args.enable_offload,
        num_gpu_blocks=args.num_gpu_blocks,
        block_size=args.block_size,
        gpu_utilization=args.gpu_utilization,
        enforce_eager=enforce_eager,
        verbose=verbose,
    )
    # Final status
    if correct == total:
        print("test_ruler_niah: PASSED")
    else:
        print(f"test_ruler_niah: FAILED ({correct}/{total})")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # `site` module for interactive use and is not guaranteed to exist
        # (e.g. under `python -S`).
        raise SystemExit(1)