[tests] Added test_niah_standalone.py.

2026-01-12 00:16:37 +08:00
parent 5895de0c97
commit a6cc703d73
6 changed files with 686 additions and 9 deletions
--- a/docs/ruler_niah_standalone_test.md
+++ b/docs/ruler_niah_standalone_test.md
@@ -0,0 +1,297 @@
+# RULER NIAH Standalone Test Plan
+
+## Overview
+
+This document describes how to independently test nano-vllm's CPU offload functionality using RULER benchmark's NIAH (Needle-In-A-Haystack) task data.
+
+## Background
+
+### Problem Being Investigated
+
+When running 32K sequence length tests with CPU offload mode, the model outputs garbled text instead of finding the magic number. This issue was traced to:
+
+- **Root Cause**: Ring buffer `max_seq_len` was set equal to `max_model_len` (32768)
+- **Issue**: When prefill uses ~32K tokens, decode needs to store KV at position 32768+, but ring buffer only has indices 0-32767
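+- **Fix Applied**: In `nanovllm/kvcache/__init__.py`, the ring buffer is now sized as `max_seq_len = max_model_len + max_new_tokens` (`max_new_tokens` defaults to 512), leaving headroom for decode tokens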
+
+### Test Objective
+
+Verify that the fix works correctly by running a standalone test with actual RULER NIAH data.
+
+## Step 1: Copy Test Data
+
+### Source Location
+
+```
+/home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl
+```
+
+### Data Format
+
+Each line is a JSON object:
+
+```json
+{
+  "index": 0,
+  "input": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA special magic number is hidden within the following text...",
+  "outputs": ["8930103"],
+  "length": 32768
+}
+```
+
+- `input`: Full prompt with Llama 3.1 chat template (~122K characters, ~30K tokens)
+- `outputs`: Expected answer (the magic number to find)
+- `length`: Target sequence length in tokens
+
+### Copy Command
+
+```bash
+mkdir -p /home/zijie/Code/nano-vllm/tests/data/ruler_niah
+cp /home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl \
+   /home/zijie/Code/nano-vllm/tests/data/ruler_niah/niah_single_1_32k.jsonl
+```
+
+## Step 2: Create Test Script
+
+Create `/home/zijie/Code/nano-vllm/tests/test_ruler_niah_32k.py`:
+
+```python
+"""
+Standalone test for RULER NIAH task with 32K context length.
+
+This test verifies that CPU offload mode correctly handles long sequences
+where prefill tokens approach max_model_len.
+
+Usage:
+    python tests/test_ruler_niah_32k.py
+"""
+
+import json
+import torch
+from pathlib import Path
+
+# LLM and SamplingParams are both exported from the package root.
+from nanovllm import LLM, SamplingParams
+
+# Configuration
+MODEL_PATH = "/data/models/Llama-3.1-8B-Instruct"
+DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
+MAX_MODEL_LEN = 32768
+MAX_NEW_TOKENS = 50
+
+# CPU Offload Settings
+ENABLE_CPU_OFFLOAD = True
+NUM_GPU_BLOCKS = 4
+BLOCK_SIZE = 1024
+
+
+def load_test_sample(filepath: Path, index: int = 0) -> dict:
+    """
+    Load a single test sample from a JSONL file.
+
+    Args:
+        filepath: Path to the JSONL file (one JSON object per line)
+        index: Zero-based line number of the sample to return
+
+    Returns:
+        The parsed sample dict
+
+    Raises:
+        ValueError: If the file has fewer than index + 1 lines
+    """
+    with open(filepath, encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if i == index:
+                return json.loads(line)
+    raise ValueError(f"Sample index {index} not found")
+
+
+def test_niah_single():
+    """
+    Test NIAH single needle task with 32K context.
+
+    Returns:
+        True if the expected magic number appears in the generated text
+    """
+    print("=" * 60)
+    print("RULER NIAH 32K Standalone Test")
+    print("=" * 60)
+
+    # Load test data
+    sample = load_test_sample(DATA_FILE, index=0)
+    prompt = sample["input"]
+    expected = sample["outputs"][0]
+
+    print(f"Prompt length: {len(prompt)} characters")
+    print(f"Expected answer: {expected}")
+    print()
+
+    # Initialize model with CPU offload
+    print("Initializing LLM with CPU offload...")
+    llm = LLM(
+        model=MODEL_PATH,
+        max_model_len=MAX_MODEL_LEN,
+        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
+        num_gpu_blocks=NUM_GPU_BLOCKS,
+        kvcache_block_size=BLOCK_SIZE,
+        enforce_eager=True,  # Disable CUDA graphs for debugging
+    )
+
+    # Generate
+    print("Generating response...")
+    # nano-vllm doesn't support greedy (temperature=0); use a low
+    # temperature for near-deterministic output instead.
+    sampling_params = SamplingParams(
+        temperature=0.1,
+        max_tokens=MAX_NEW_TOKENS,
+    )
+
+    # generate() returns one dict per prompt; the decoded text is under "text".
+    outputs = llm.generate([prompt], sampling_params)
+    generated_text = outputs[0]["text"]
+
+    print()
+    print("=" * 60)
+    print("Results")
+    print("=" * 60)
+    print(f"Expected: {expected}")
+    print(f"Generated: {generated_text[:200]}...")
+    print()
+
+    # Check if expected number is in output
+    if expected in generated_text:
+        print("SUCCESS: Magic number found in output!")
+        return True
+    else:
+        print("FAILED: Magic number NOT found in output")
+        print(f"Full output: {generated_text}")
+        return False
+
+
+def test_multiple_samples(num_samples: int = 5):
+    """
+    Test multiple NIAH samples, reusing a single LLM instance.
+
+    Args:
+        num_samples: Number of samples (from the start of the file) to run
+
+    Returns:
+        True only if every sample's magic number was found
+    """
+    print("=" * 60)
+    print(f"Testing {num_samples} NIAH samples with 32K context")
+    print("=" * 60)
+
+    # Initialize model once
+    llm = LLM(
+        model=MODEL_PATH,
+        max_model_len=MAX_MODEL_LEN,
+        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
+        num_gpu_blocks=NUM_GPU_BLOCKS,
+        kvcache_block_size=BLOCK_SIZE,
+        enforce_eager=True,
+    )
+
+    # nano-vllm doesn't support greedy (temperature=0); use a low
+    # temperature for near-deterministic output instead.
+    sampling_params = SamplingParams(
+        temperature=0.1,
+        max_tokens=MAX_NEW_TOKENS,
+    )
+
+    correct = 0
+    for i in range(num_samples):
+        sample = load_test_sample(DATA_FILE, index=i)
+        prompt = sample["input"]
+        expected = sample["outputs"][0]
+
+        outputs = llm.generate([prompt], sampling_params)
+        generated_text = outputs[0]["text"]
+
+        if expected in generated_text:
+            print(f"Sample {i}: PASS (found {expected})")
+            correct += 1
+        else:
+            print(f"Sample {i}: FAIL (expected {expected}, got: {generated_text[:50]}...)")
+
+    print()
+    print(f"Accuracy: {correct}/{num_samples} ({100*correct/num_samples:.1f}%)")
+    return correct == num_samples
+
+
+if __name__ == "__main__":
+    import sys
+
+    if len(sys.argv) > 1 and sys.argv[1] == "--all":
+        success = test_multiple_samples(5)
+    else:
+        success = test_niah_single()
+
+    sys.exit(0 if success else 1)
+```
+
+## Step 3: Run Test
+
+### Single Sample Test
+
+```bash
+cd /home/zijie/Code/nano-vllm
+CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py
+```
+
+### All 5 Samples
+
+```bash
+cd /home/zijie/Code/nano-vllm
+CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py --all
+```
+
+## Step 4: Expected Results
+
+### Before Fix (Bug)
+
+- Output: Garbled text like "not only has been replaced by thesiums..."
+- Score: 0% (magic number not found)
+- Time: ~80 seconds per sample
+
+### After Fix (Expected)
+
+- Output: The magic number (e.g., "8930103")
+- Score: ~100% (magic number found)
+- Time: ~80 seconds per sample (same, as the compute is unchanged)
+
+## Debugging Tips
+
+### Enable Verbose Logging
+
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+### Check Ring Buffer Size
+
+In the logs, verify:
+```
+OffloadEngine initializing: num_layers=32, num_kv_buffers=4, max_seq_len=33280
+```
+
+The `max_seq_len` should be `32768 + 512 = 33280` (not 32768).
+
+### Monitor GPU Memory
+
+```bash
+watch -n 1 nvidia-smi
+```
+
+With CPU offload, GPU memory for KV cache should be ~640MB (ring buffer only).
+
+## Related Files
+
+| File | Description |
+|------|-------------|
+| `nanovllm/kvcache/__init__.py` | Fix location: `max_seq_len = max_model_len + max_new_tokens` (default 512) |
+| `nanovllm/kvcache/offload_engine.py` | Ring buffer allocation |
+| `nanovllm/engine/model_runner.py` | Layer-wise offload prefill/decode |
+| `nanovllm/kvcache/hybrid_manager.py` | CPU block management |
+
+## Test Data Details
+
+### NIAH Task Description
+
+The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a specific piece of information (the "needle") from a large context (the "haystack").
+
+- **Needle**: A magic number associated with a keyword (e.g., "worried-purse")
+- **Haystack**: ~30K tokens of distractor text
+- **Task**: Extract the magic number when asked
+
+### Sample Prompt Structure
+
+```
+<|begin_of_text|><|start_header_id|>user<|end_header_id|>
+
+A special magic number is hidden within the following text. Make sure to memorize it. I will quiz you about the number afterwards.
+
+[... haystack text (the whole prompt totals ~30K tokens) ...]
+
+The special magic number for worried-purse is 8930103.
+
+[... more haystack text ...]
+
+What is the special magic number for worried-purse mentioned in the provided text?
+<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+ The special magic number for worried-purse mentioned in the provided text is
+```
+
+The model should complete with: `8930103`
--- a/nanovllm/config.py
+++ b/nanovllm/config.py
@@ -61,6 +61,15 @@ class Config:
        self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
        assert self.max_num_batched_tokens >= self.max_model_len

+        # CPU offload mode only supports single sequence (layer-wise processing)
+        if self.enable_cpu_offload and self.max_num_seqs != 1:
+            import logging
+            logging.warning(
+                f"CPU offload mode only supports single sequence. "
+                f"Overriding max_num_seqs from {self.max_num_seqs} to 1."
+            )
+            self.max_num_seqs = 1
+
        # Override torch_dtype if user specified
        if self.dtype is not None:
            dtype_map = {
--- a/nanovllm/engine/model_runner.py
+++ b/nanovllm/engine/model_runner.py
@@ -27,7 +27,9 @@ class ModelRunner:
        self.rank = rank
        self.event = event

-        dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
+        import os
+        port = os.environ.get("NANOVLLM_DIST_PORT", "2333")
+        dist.init_process_group("nccl", f"tcp://localhost:{port}", world_size=self.world_size, rank=rank)
        torch.cuda.set_device(rank)
        default_dtype = torch.get_default_dtype()
        torch.set_default_dtype(hf_config.torch_dtype)
@@ -546,8 +548,8 @@ class ModelRunner:
            k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
            v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)

-            # Q/K norms (Qwen3 specific)
-            if not layer.self_attn.qkv_bias:
+            # Q/K norms (Qwen3 specific - only when qkv_bias=False)
+            if not getattr(layer.self_attn, 'qkv_bias', True):
                num_tokens = q.shape[0]
                q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
@@ -649,8 +651,8 @@ class ModelRunner:
            k_new = k_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
            v_new = v_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)

-            # Q/K norms
-            if not layer.self_attn.qkv_bias:
+            # Q/K norms (Qwen3 specific - only when qkv_bias=False)
+            if not getattr(layer.self_attn, 'qkv_bias', True):
                q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                q = q.view(1, layer.self_attn.num_heads, layer.self_attn.head_dim)
                k_new = layer.self_attn.k_norm(k_new.reshape(-1, layer.self_attn.head_dim))
@@ -785,8 +787,8 @@ class ModelRunner:
                k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
                v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)

-                # Q/K norms (Qwen3 specific)
-                if not layer.self_attn.qkv_bias:
+                # Q/K norms (Qwen3 specific - only when qkv_bias=False)
+                if not getattr(layer.self_attn, 'qkv_bias', True):
                    num_tokens = q.shape[0]
                    q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
                    q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
--- a/nanovllm/kvcache/init.py
+++ b/nanovllm/kvcache/init.py
@@ -71,6 +71,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
        threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
    )

+    # max_seq_len needs to be larger than max_model_len to accommodate decode tokens
+    # When prefill uses ~max_model_len tokens, decode needs additional slots
+    # Add max_new_tokens (default 512) buffer for decode phase
+    max_new_tokens = getattr(config, 'max_new_tokens', 512)
+    max_seq_len = config.max_model_len + max_new_tokens
+
    return HybridKVCacheManager(
        num_gpu_slots=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
@@ -78,7 +84,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
        policy=eviction_policy,
        sparse_policy=sparse_policy,
        num_kv_buffers=getattr(config, 'num_kv_buffers', 4),
-        max_seq_len=config.max_model_len,
+        max_seq_len=max_seq_len,
    )


--- a/nanovllm/models/init.py
+++ b/nanovllm/models/init.py
@@ -3,7 +3,13 @@
 from nanovllm.models.registry import register_model, get_model_class, MODEL_REGISTRY

 # Import models to trigger registration
+# Qwen3 requires transformers>=4.51.0 for Qwen3Config
+try:
    from nanovllm.models import qwen3
+except ImportError as e:
+    import warnings
+    warnings.warn(f"Qwen3 model not available (requires transformers>=4.51.0): {e}")
+
 from nanovllm.models import llama

 __all__ = ["register_model", "get_model_class", "MODEL_REGISTRY"]
--- a/tests/test_ruler_niah.py
+++ b/tests/test_ruler_niah.py
@@ -0,0 +1,357 @@
+"""
+RULER NIAH benchmark test for LLM.
+
+Tests: Long context retrieval capability using pre-generated RULER benchmark data.
+The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
+specific magic number from a large context (~32K tokens).
+
+Usage:
+    # Test all samples with CPU offload
+    python tests/test_ruler_niah.py --enable-offload
+
+    # Test specific samples
+    python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload
+
+    # Test with custom model
+    python tests/test_ruler_niah.py --model /path/to/model --enable-offload
+"""
+
+import os
+os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"
+
+import argparse
+import json
+from pathlib import Path
+from typing import List, Tuple, Optional
+
+from nanovllm import LLM, SamplingParams
+from utils import check_needle_answer
+
+
+# ============================================================
+# Constants
+# ============================================================
+
+DEFAULT_DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
+DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
+DEFAULT_MAX_MODEL_LEN = 32768
+DEFAULT_MAX_NEW_TOKENS = 50
+
+
+# ============================================================
+# Data Loading
+# ============================================================
+
+def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
+    """
+    Load RULER NIAH samples from a JSONL file.
+
+    Args:
+        filepath: Path to the JSONL file
+        indices: Optional list of zero-based line indices to load. If None,
+            load all. Samples come back in file order regardless of the order
+            of `indices`; indices past the end of the file match nothing.
+
+    Returns:
+        List of sample dicts with keys: index, input, outputs, length
+
+    Raises:
+        FileNotFoundError: If `filepath` does not exist
+        ValueError: If no sample was selected (empty file or no index matched)
+    """
+    if not filepath.exists():
+        raise FileNotFoundError(
+            f"Data file not found: {filepath}\n"
+            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
+        )
+
+    # A set keeps the per-line membership check O(1) for long index lists.
+    wanted = None if indices is None else set(indices)
+
+    samples = []
+    with open(filepath, encoding="utf-8") as f:
+        for i, line in enumerate(f):
+            if wanted is not None and i not in wanted:
+                continue
+            # Skip blank lines (e.g. a trailing newline at EOF) rather than
+            # letting json.loads fail on them. They still occupy an index.
+            if not line.strip():
+                continue
+            samples.append(json.loads(line))
+
+    if not samples:
+        raise ValueError(f"No samples loaded from {filepath}")
+
+    return samples
+
+
+def count_samples(filepath: Path) -> int:
+    """Return the number of lines (one sample per line) in a JSONL file."""
+    n_lines = 0
+    with open(filepath) as f:
+        for _ in f:
+            n_lines += 1
+    return n_lines
+
+
+# ============================================================
+# Test Function
+# ============================================================
+
+def run_ruler_niah_test(
+    model_path: str,
+    data_file: Path,
+    sample_indices: Optional[List[int]] = None,
+    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
+    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
+    enable_cpu_offload: bool = False,
+    num_gpu_blocks: int = 4,
+    block_size: int = 1024,
+    gpu_utilization: float = 0.9,
+    enforce_eager: bool = True,
+    verbose: bool = True,
+) -> Tuple[int, int]:
+    """
+    Generate an answer for every selected RULER NIAH sample and score it.
+
+    A single LLM instance is created and reused. Prompts are submitted one
+    at a time, and each output is checked with check_needle_answer against
+    the first entry of the sample's "outputs" list.
+
+    Args:
+        model_path: Path to the model
+        data_file: Path to JSONL data file
+        sample_indices: List of sample indices to test (None = all)
+        max_model_len: Maximum model context length
+        max_new_tokens: Maximum tokens to generate
+        enable_cpu_offload: Enable CPU offload mode
+        num_gpu_blocks: Number of GPU blocks for offload (only passed to the
+            LLM when enable_cpu_offload is set)
+        block_size: KV cache block size
+        gpu_utilization: GPU memory utilization fraction
+        enforce_eager: Disable CUDA graphs
+        verbose: Print detailed output
+
+    Returns:
+        (correct, total): Number of correct and total samples
+    """
+    rule = "=" * 60
+
+    # Load samples
+    samples = load_ruler_samples(data_file, sample_indices)
+    total = len(samples)
+
+    if verbose:
+        print(f"\n{rule}")
+        print("RULER NIAH Test")
+        print(rule)
+        print(f"Model: {model_path}")
+        print(f"Data file: {data_file}")
+        print(f"Samples: {total}")
+        print(f"Max model len: {max_model_len}")
+        print(f"Max new tokens: {max_new_tokens}")
+        print(f"CPU offload: {enable_cpu_offload}")
+        if enable_cpu_offload:
+            print(f"  num_gpu_blocks: {num_gpu_blocks}")
+            print(f"  block_size: {block_size}")
+        print(f"Enforce eager: {enforce_eager}")
+        print(f"{rule}\n")
+
+    # The length warning is emitted even in quiet mode.
+    sample_lengths = [s.get("length", 0) for s in samples]
+    longest = max(sample_lengths)
+    if longest > max_model_len:
+        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({longest})")
+        print("         This may cause truncation or errors.\n")
+
+    # Initialize LLM
+    if verbose:
+        print("Initializing LLM...")
+
+    llm_kwargs = dict(
+        max_model_len=max_model_len,
+        max_num_batched_tokens=max_model_len,
+        enforce_eager=enforce_eager,
+        gpu_memory_utilization=gpu_utilization,
+        kvcache_block_size=block_size,
+        enable_cpu_offload=enable_cpu_offload,
+    )
+    if enable_cpu_offload:
+        llm_kwargs["num_gpu_blocks"] = num_gpu_blocks
+    llm = LLM(model_path, **llm_kwargs)
+
+    # Note: nano-vllm doesn't support greedy (temperature=0), use low temperature instead
+    sampling_params = SamplingParams(
+        temperature=0.1,  # Low temperature for near-deterministic output
+        max_tokens=max_new_tokens,
+    )
+
+    # Run the samples one by one, recording a result row for each
+    results = []
+    for position, sample in enumerate(samples):
+        sample_idx = sample.get("index", position)
+        prompt = sample["input"]
+        expected = sample["outputs"][0]
+
+        if verbose:
+            print(f"\nSample {sample_idx}: Expected={expected}, Length={sample.get('length', 'unknown')}")
+
+        generation = llm.generate([prompt], sampling_params, use_tqdm=False)[0]
+        output_text = generation["text"]
+        output_tokens = generation["token_ids"]
+
+        passed = check_needle_answer(output_text, expected)
+        results.append({
+            "index": sample_idx,
+            "expected": expected,
+            "output": output_text,
+            "passed": passed,
+        })
+
+        if verbose:
+            preview = output_text[:100].replace('\n', ' ')
+            print(f"  Output ({len(output_tokens)} tokens): {preview}...")
+            print(f"  Status: {'PASS' if passed else 'FAIL'}")
+
+    correct = sum(1 for r in results if r["passed"])
+
+    # Summary
+    if verbose:
+        print(f"\n{rule}")
+        print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
+        print(f"{rule}\n")
+
+        failures = [r for r in results if not r["passed"]]
+        if failures:
+            print("Failed samples:")
+            for r in failures:
+                print(f"  Sample {r['index']}: expected={r['expected']}, got={r['output'][:50]}...")
+
+    return correct, total
+
+
+# ============================================================
+# CLI Entry Point
+# ============================================================
+
+def parse_indices(s: str) -> Optional[List[int]]:
+    """
+    Parse a sample-index specification given on the command line.
+
+    The string is a comma-separated list whose items are either a single
+    index ('3') or an inclusive range ('0-4'), e.g. '0,2,5-7' gives
+    [0, 2, 5, 6, 7]. Empty items (such as a trailing comma) are ignored.
+
+    Args:
+        s: Specification string; an empty string means "no selection"
+
+    Returns:
+        Indices in the order written, or None if `s` is empty
+
+    Raises:
+        ValueError: If an item is neither an integer nor a START-END range
+    """
+    if not s:
+        return None
+    indices = []
+    for part in s.split(','):
+        part = part.strip()
+        if not part:
+            # Tolerate stray separators such as a trailing comma.
+            continue
+        try:
+            if '-' in part:
+                start, end = part.split('-')
+                indices.extend(range(int(start), int(end) + 1))
+            else:
+                indices.append(int(part))
+        except ValueError:
+            # Covers both non-numeric text and malformed ranges like '1-2-3'.
+            raise ValueError(
+                f"Invalid sample index spec {part!r}: expected N or START-END"
+            ) from None
+    return indices
+
+
+if __name__ == "__main__":
+    # CLI wrapper around run_ruler_niah_test; exits with status 1 unless
+    # every selected sample passes.
+    parser = argparse.ArgumentParser(
+        description="RULER NIAH benchmark test for long context LLM",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Test all samples with CPU offload (recommended for 24GB GPUs)
+  python tests/test_ruler_niah.py --enable-offload
+
+  # Test specific samples
+  python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload
+
+  # Test with CUDA graph enabled
+  python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
+        """
+    )
+
+    parser.add_argument(
+        "--model", "-m",
+        type=str,
+        default=DEFAULT_MODEL,
+        help=f"Path to model (default: {DEFAULT_MODEL})"
+    )
+    parser.add_argument(
+        "--data-file",
+        type=str,
+        default=str(DEFAULT_DATA_FILE),
+        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
+    )
+    parser.add_argument(
+        "--sample-indices",
+        type=str,
+        default="",
+        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
+    )
+    parser.add_argument(
+        "--max-model-len",
+        type=int,
+        default=DEFAULT_MAX_MODEL_LEN,
+        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
+    )
+    parser.add_argument(
+        "--max-new-tokens",
+        type=int,
+        default=DEFAULT_MAX_NEW_TOKENS,
+        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
+    )
+    parser.add_argument(
+        "--enable-offload",
+        action="store_true",
+        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
+    )
+    parser.add_argument(
+        "--num-gpu-blocks",
+        type=int,
+        default=4,
+        help="Number of GPU blocks for CPU offload (default: 4)"
+    )
+    parser.add_argument(
+        "--block-size",
+        type=int,
+        default=1024,
+        help="KV cache block size (default: 1024)"
+    )
+    parser.add_argument(
+        "--gpu-utilization",
+        type=float,
+        default=0.9,
+        help="GPU memory utilization fraction (default: 0.9)"
+    )
+    # --enforce-eager and --verbose are already the defaults and their parsed
+    # values are never read; they are kept so existing command lines still
+    # parse. The effective switches are --use-cuda-graph and --quiet.
+    parser.add_argument(
+        "--enforce-eager",
+        action="store_true",
+        default=True,
+        help="Force eager execution, disable CUDA graphs "
+             "(this is the default; pass --use-cuda-graph to turn it off)"
+    )
+    parser.add_argument(
+        "--use-cuda-graph",
+        action="store_true",
+        help="Enable CUDA graph (overrides --enforce-eager)"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        default=True,
+        help="Print detailed output (this is the default; pass --quiet to turn it off)"
+    )
+    parser.add_argument(
+        "--quiet", "-q",
+        action="store_true",
+        help="Quiet mode, only print final result"
+    )
+
+    args = parser.parse_args()
+
+    # Process arguments
+    sample_indices = parse_indices(args.sample_indices)
+    enforce_eager = not args.use_cuda_graph
+    verbose = not args.quiet
+
+    # Run test
+    correct, total = run_ruler_niah_test(
+        model_path=os.path.expanduser(args.model),
+        data_file=Path(args.data_file),
+        sample_indices=sample_indices,
+        max_model_len=args.max_model_len,
+        max_new_tokens=args.max_new_tokens,
+        enable_cpu_offload=args.enable_offload,
+        num_gpu_blocks=args.num_gpu_blocks,
+        block_size=args.block_size,
+        gpu_utilization=args.gpu_utilization,
+        enforce_eager=enforce_eager,
+        verbose=verbose,
+    )
+
+    # Final status
+    if correct == total:
+        print("test_ruler_niah: PASSED")
+    else:
+        print(f"test_ruler_niah: FAILED ({correct}/{total})")
+        # The bare exit() helper is injected by the site module and is not
+        # guaranteed to exist (e.g. under python -S); SystemExit always is.
+        raise SystemExit(1)