From a6cc703d733715c49f71545339d9e6aabb4c9790 Mon Sep 17 00:00:00 2001 From: Zijie Tian Date: Mon, 12 Jan 2026 00:16:37 +0800 Subject: [PATCH] [tests] Added test_niah_standalone.py. --- docs/ruler_niah_standalone_test.md | 297 ++++++++++++++++++++++++ nanovllm/config.py | 9 + nanovllm/engine/model_runner.py | 16 +- nanovllm/kvcache/__init__.py | 8 +- nanovllm/models/__init__.py | 8 +- tests/test_ruler_niah.py | 357 +++++++++++++++++++++++++++++ 6 files changed, 686 insertions(+), 9 deletions(-) create mode 100644 docs/ruler_niah_standalone_test.md create mode 100644 tests/test_ruler_niah.py diff --git a/docs/ruler_niah_standalone_test.md b/docs/ruler_niah_standalone_test.md new file mode 100644 index 0000000..a3309b4 --- /dev/null +++ b/docs/ruler_niah_standalone_test.md @@ -0,0 +1,297 @@ +# RULER NIAH Standalone Test Plan + +## Overview + +This document describes how to independently test nano-vllm's CPU offload functionality using RULER benchmark's NIAH (Needle-In-A-Haystack) task data. + +## Background + +### Problem Being Investigated + +When running 32K sequence length tests with CPU offload mode, the model outputs garbled text instead of finding the magic number. This issue was traced to: + +- **Root Cause**: Ring buffer `max_seq_len` was set equal to `max_model_len` (32768) +- **Issue**: When prefill uses ~32K tokens, decode needs to store KV at position 32768+, but ring buffer only has indices 0-32767 +- **Fix Applied**: In `nanovllm/kvcache/__init__.py`, changed `max_seq_len = max_model_len + 512` + +### Test Objective + +Verify that the fix works correctly by running a standalone test with actual RULER NIAH data. 
+ +## Step 1: Copy Test Data + +### Source Location + +``` +/home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl +``` + +### Data Format + +Each line is a JSON object: + +```json +{ + "index": 0, + "input": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA special magic number is hidden within the following text...", + "outputs": ["8930103"], + "length": 32768 +} +``` + +- `input`: Full prompt with Llama 3.1 chat template (~122K characters, ~30K tokens) +- `outputs`: Expected answer (the magic number to find) +- `length`: Target sequence length in tokens + +### Copy Command + +```bash +mkdir -p /home/zijie/Code/nano-vllm/tests/data/ruler_niah +cp /home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl \ + /home/zijie/Code/nano-vllm/tests/data/ruler_niah/niah_single_1_32k.jsonl +``` + +## Step 2: Create Test Script + +Create `/home/zijie/Code/nano-vllm/tests/test_ruler_niah_32k.py`: + +```python +""" +Standalone test for RULER NIAH task with 32K context length. + +This test verifies that CPU offload mode correctly handles long sequences +where prefill tokens approach max_model_len. 
+ +Usage: + python tests/test_ruler_niah_32k.py +""" + +import json +import torch +from pathlib import Path + +from nanovllm import LLM +from nanovllm.config import SamplingParams + +# Configuration +MODEL_PATH = "/data/models/Llama-3.1-8B-Instruct" +DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl" +MAX_MODEL_LEN = 32768 +MAX_NEW_TOKENS = 50 + +# CPU Offload Settings +ENABLE_CPU_OFFLOAD = True +NUM_GPU_BLOCKS = 4 +BLOCK_SIZE = 1024 + + +def load_test_sample(filepath: Path, index: int = 0) -> dict: + """Load a single test sample from JSONL file.""" + with open(filepath) as f: + for i, line in enumerate(f): + if i == index: + return json.loads(line) + raise ValueError(f"Sample index {index} not found") + + +def test_niah_single(): + """Test NIAH single needle task with 32K context.""" + print("=" * 60) + print("RULER NIAH 32K Standalone Test") + print("=" * 60) + + # Load test data + sample = load_test_sample(DATA_FILE, index=0) + prompt = sample["input"] + expected = sample["outputs"][0] + + print(f"Prompt length: {len(prompt)} characters") + print(f"Expected answer: {expected}") + print() + + # Initialize model with CPU offload + print("Initializing LLM with CPU offload...") + llm = LLM( + model=MODEL_PATH, + max_model_len=MAX_MODEL_LEN, + enable_cpu_offload=ENABLE_CPU_OFFLOAD, + num_gpu_blocks=NUM_GPU_BLOCKS, + kvcache_block_size=BLOCK_SIZE, + enforce_eager=True, # Disable CUDA graphs for debugging + ) + + # Generate + print("Generating response...") + sampling_params = SamplingParams( + temperature=0.0, # Greedy + max_tokens=MAX_NEW_TOKENS, + ) + + outputs = llm.generate([prompt], sampling_params) + generated_text = outputs[0].outputs[0].text + + print() + print("=" * 60) + print("Results") + print("=" * 60) + print(f"Expected: {expected}") + print(f"Generated: {generated_text[:200]}...") + print() + + # Check if expected number is in output + if expected in generated_text: + print("SUCCESS: Magic number found in output!") + return 
True + else: + print("FAILED: Magic number NOT found in output") + print(f"Full output: {generated_text}") + return False + + +def test_multiple_samples(num_samples: int = 5): + """Test multiple NIAH samples.""" + print("=" * 60) + print(f"Testing {num_samples} NIAH samples with 32K context") + print("=" * 60) + + # Initialize model once + llm = LLM( + model=MODEL_PATH, + max_model_len=MAX_MODEL_LEN, + enable_cpu_offload=ENABLE_CPU_OFFLOAD, + num_gpu_blocks=NUM_GPU_BLOCKS, + kvcache_block_size=BLOCK_SIZE, + enforce_eager=True, + ) + + sampling_params = SamplingParams( + temperature=0.0, + max_tokens=MAX_NEW_TOKENS, + ) + + correct = 0 + for i in range(num_samples): + sample = load_test_sample(DATA_FILE, index=i) + prompt = sample["input"] + expected = sample["outputs"][0] + + outputs = llm.generate([prompt], sampling_params) + generated_text = outputs[0].outputs[0].text + + if expected in generated_text: + print(f"Sample {i}: PASS (found {expected})") + correct += 1 + else: + print(f"Sample {i}: FAIL (expected {expected}, got: {generated_text[:50]}...)") + + print() + print(f"Accuracy: {correct}/{num_samples} ({100*correct/num_samples:.1f}%)") + return correct == num_samples + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1 and sys.argv[1] == "--all": + success = test_multiple_samples(5) + else: + success = test_niah_single() + + sys.exit(0 if success else 1) +``` + +## Step 3: Run Test + +### Single Sample Test + +```bash +cd /home/zijie/Code/nano-vllm +CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py +``` + +### All 5 Samples + +```bash +cd /home/zijie/Code/nano-vllm +CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py --all +``` + +## Step 4: Expected Results + +### Before Fix (Bug) + +- Output: Garbled text like "not only has been replaced by thesiums..." 
+- Score: 0% (magic number not found) +- Time: ~80 seconds per sample + +### After Fix (Expected) + +- Output: The magic number (e.g., "8930103") +- Score: ~100% (magic number found) +- Time: ~80 seconds per sample (same, as the compute is unchanged) + +## Debugging Tips + +### Enable Verbose Logging + +```python +import logging +logging.basicConfig(level=logging.DEBUG) +``` + +### Check Ring Buffer Size + +In the logs, verify: +``` +OffloadEngine initializing: num_layers=32, num_kv_buffers=4, max_seq_len=33280 +``` + +The `max_seq_len` should be `32768 + 512 = 33280` (not 32768). + +### Monitor GPU Memory + +```bash +watch -n 1 nvidia-smi +``` + +With CPU offload, GPU memory for KV cache should be ~640MB (ring buffer only). + +## Related Files + +| File | Description | +|------|-------------| +| `nanovllm/kvcache/__init__.py` | Fix location: `max_seq_len = max_model_len + 512` | +| `nanovllm/kvcache/offload_engine.py` | Ring buffer allocation | +| `nanovllm/engine/model_runner.py` | Layer-wise offload prefill/decode | +| `nanovllm/kvcache/hybrid_manager.py` | CPU block management | + +## Test Data Details + +### NIAH Task Description + +The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a specific piece of information (the "needle") from a large context (the "haystack"). + +- **Needle**: A magic number associated with a keyword (e.g., "worried-purse") +- **Haystack**: ~30K tokens of distractor text +- **Task**: Extract the magic number when asked + +### Sample Prompt Structure + +``` +<|begin_of_text|><|start_header_id|>user<|end_header_id|> + +A special magic number is hidden within the following text. Make sure to memorize it. I will quiz you about the number afterwards. + +[... ~30K tokens of haystack text ...] + +The special magic number for worried-purse is 8930103. + +[... more haystack text ...] + +What is the special magic number for worried-purse mentioned in the provided text? 
+<|eot_id|><|start_header_id|>assistant<|end_header_id|> + + The special magic number for worried-purse mentioned in the provided text is +``` + +The model should complete with: `8930103` diff --git a/nanovllm/config.py b/nanovllm/config.py index 993264a..540a8a6 100644 --- a/nanovllm/config.py +++ b/nanovllm/config.py @@ -61,6 +61,15 @@ class Config: self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings) assert self.max_num_batched_tokens >= self.max_model_len + # CPU offload mode only supports single sequence (layer-wise processing) + if self.enable_cpu_offload and self.max_num_seqs != 1: + import logging + logging.warning( + f"CPU offload mode only supports single sequence. " + f"Overriding max_num_seqs from {self.max_num_seqs} to 1." + ) + self.max_num_seqs = 1 + # Override torch_dtype if user specified if self.dtype is not None: dtype_map = { diff --git a/nanovllm/engine/model_runner.py b/nanovllm/engine/model_runner.py index 07bfe59..eda4f7c 100644 --- a/nanovllm/engine/model_runner.py +++ b/nanovllm/engine/model_runner.py @@ -27,7 +27,9 @@ class ModelRunner: self.rank = rank self.event = event - dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank) + import os + port = os.environ.get("NANOVLLM_DIST_PORT", "2333") + dist.init_process_group("nccl", f"tcp://localhost:{port}", world_size=self.world_size, rank=rank) torch.cuda.set_device(rank) default_dtype = torch.get_default_dtype() torch.set_default_dtype(hf_config.torch_dtype) @@ -546,8 +548,8 @@ class ModelRunner: k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) - # Q/K norms (Qwen3 specific) - if not layer.self_attn.qkv_bias: + # Q/K norms (Qwen3 specific - only when qkv_bias=False) + if not getattr(layer.self_attn, 'qkv_bias', True): num_tokens = q.shape[0] q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim)) q 
= q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim) @@ -649,8 +651,8 @@ class ModelRunner: k_new = k_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) v_new = v_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) - # Q/K norms - if not layer.self_attn.qkv_bias: + # Q/K norms (Qwen3 specific - only when qkv_bias=False) + if not getattr(layer.self_attn, 'qkv_bias', True): q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim)) q = q.view(1, layer.self_attn.num_heads, layer.self_attn.head_dim) k_new = layer.self_attn.k_norm(k_new.reshape(-1, layer.self_attn.head_dim)) @@ -785,8 +787,8 @@ class ModelRunner: k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim) - # Q/K norms (Qwen3 specific) - if not layer.self_attn.qkv_bias: + # Q/K norms (Qwen3 specific - only when qkv_bias=False) + if not getattr(layer.self_attn, 'qkv_bias', True): num_tokens = q.shape[0] q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim)) q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim) diff --git a/nanovllm/kvcache/__init__.py b/nanovllm/kvcache/__init__.py index 3e44b31..74e42a2 100644 --- a/nanovllm/kvcache/__init__.py +++ b/nanovllm/kvcache/__init__.py @@ -71,6 +71,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager: threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4), ) + # max_seq_len needs to be larger than max_model_len to accommodate decode tokens + # When prefill uses ~max_model_len tokens, decode needs additional slots + # Add max_new_tokens (default 512) buffer for decode phase + max_new_tokens = getattr(config, 'max_new_tokens', 512) + max_seq_len = config.max_model_len + max_new_tokens + return HybridKVCacheManager( num_gpu_slots=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks, @@ -78,7 +84,7 @@ def create_kvcache_manager(config: "Config") -> 
KVCacheManager: policy=eviction_policy, sparse_policy=sparse_policy, num_kv_buffers=getattr(config, 'num_kv_buffers', 4), - max_seq_len=config.max_model_len, + max_seq_len=max_seq_len, ) diff --git a/nanovllm/models/__init__.py b/nanovllm/models/__init__.py index 28d41b2..b7bbce3 100644 --- a/nanovllm/models/__init__.py +++ b/nanovllm/models/__init__.py @@ -3,7 +3,13 @@ from nanovllm.models.registry import register_model, get_model_class, MODEL_REGISTRY # Import models to trigger registration -from nanovllm.models import qwen3 +# Qwen3 requires transformers>=4.51.0 for Qwen3Config +try: + from nanovllm.models import qwen3 +except ImportError as e: + import warnings + warnings.warn(f"Qwen3 model not available (requires transformers>=4.51.0): {e}") + from nanovllm.models import llama __all__ = ["register_model", "get_model_class", "MODEL_REGISTRY"] diff --git a/tests/test_ruler_niah.py b/tests/test_ruler_niah.py new file mode 100644 index 0000000..d39b747 --- /dev/null +++ b/tests/test_ruler_niah.py @@ -0,0 +1,357 @@ +""" +RULER NIAH benchmark test for LLM. + +Tests: Long context retrieval capability using pre-generated RULER benchmark data. +The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a +specific magic number from a large context (~32K tokens). 
"""
RULER NIAH benchmark test for LLM.

Tests: Long context retrieval capability using pre-generated RULER benchmark data.
The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
specific magic number from a large context (~32K tokens).

Usage:
    # Test all samples with CPU offload
    python tests/test_ruler_niah.py --enable-offload

    # Test specific samples
    python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

    # Test with custom model
    python tests/test_ruler_niah.py --model /path/to/model --enable-offload
"""

import argparse
import json
import os
import sys
from pathlib import Path
from typing import List, Optional, Tuple

# Set the log level before nanovllm is imported. The import itself is deferred
# to run_ruler_niah_test() so the pure helpers below stay importable on
# machines without the GPU stack.
# NOTE(review): presumably nanovllm reads this variable at import time - confirm.
os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"


# ============================================================
# Constants
# ============================================================

DEFAULT_DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
DEFAULT_MAX_MODEL_LEN = 32768
DEFAULT_MAX_NEW_TOKENS = 50


# ============================================================
# Data Loading
# ============================================================

def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
    """
    Load RULER NIAH samples from a JSONL file.

    Args:
        filepath: Path to the JSONL file
        indices: Optional list of 0-based line numbers to load. If None, load all.

    Returns:
        List of sample dicts (one per selected non-blank line), in file order.

    Raises:
        FileNotFoundError: If the data file does not exist.
        ValueError: If no sample was loaded.
    """
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
        )

    # A set gives O(1) membership per line instead of scanning a list each time.
    wanted = None if indices is None else set(indices)
    found = set()
    samples = []

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if wanted is not None and i not in wanted:
                continue
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not samples.
                continue
            samples.append(json.loads(line))
            found.add(i)

    if wanted is not None:
        missing = sorted(wanted - found)
        if missing:
            # Surface this: silently testing fewer samples than requested
            # could otherwise be reported as "all passed".
            print(f"WARNING: requested sample indices not found in {filepath}: {missing}")

    if not samples:
        raise ValueError(f"No samples loaded from {filepath}")

    return samples


def count_samples(filepath: Path) -> int:
    """Count samples (non-blank lines) in a JSONL file."""
    with open(filepath, encoding="utf-8") as f:
        return sum(1 for line in f if line.strip())


# ============================================================
# Test Function
# ============================================================

def _print_config(
    model_path: str,
    data_file: Path,
    total: int,
    max_model_len: int,
    max_new_tokens: int,
    enable_cpu_offload: bool,
    num_gpu_blocks: int,
    block_size: int,
    enforce_eager: bool,
) -> None:
    """Print the banner describing the run configuration."""
    print(f"\n{'='*60}")
    print("RULER NIAH Test")
    print(f"{'='*60}")
    print(f"Model: {model_path}")
    print(f"Data file: {data_file}")
    print(f"Samples: {total}")
    print(f"Max model len: {max_model_len}")
    print(f"Max new tokens: {max_new_tokens}")
    print(f"CPU offload: {enable_cpu_offload}")
    if enable_cpu_offload:
        print(f" num_gpu_blocks: {num_gpu_blocks}")
        print(f" block_size: {block_size}")
    print(f"Enforce eager: {enforce_eager}")
    print(f"{'='*60}\n")


def _print_summary(results: List[dict], correct: int, total: int) -> None:
    """Print the pass rate followed by a short line per failed sample."""
    print(f"\n{'='*60}")
    print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
    print(f"{'='*60}\n")

    if correct < total:
        print("Failed samples:")
        for r in results:
            if not r["passed"]:
                print(f" Sample {r['index']}: expected={r['expected']}, got={r['output'][:50]}...")


def run_ruler_niah_test(
    model_path: str,
    data_file: Path,
    sample_indices: Optional[List[int]] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
    verbose: bool = True,
) -> Tuple[int, int]:
    """
    Run RULER NIAH test on loaded samples.

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        sample_indices: List of sample indices to test (None = all)
        max_model_len: Maximum model context length
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        num_gpu_blocks: Number of GPU blocks for offload
        block_size: KV cache block size
        gpu_utilization: GPU memory utilization fraction
        enforce_eager: Disable CUDA graphs
        verbose: Print detailed output

    Returns:
        (correct, total): Number of correct and total samples
    """
    # Deferred project imports: see the note next to NANOVLLM_LOG_LEVEL above.
    from nanovllm import LLM, SamplingParams
    from utils import check_needle_answer

    # Load samples (raises if the file is missing or nothing matched, so
    # `total` is always > 0 below).
    samples = load_ruler_samples(data_file, sample_indices)
    total = len(samples)

    if verbose:
        _print_config(
            model_path, data_file, total, max_model_len, max_new_tokens,
            enable_cpu_offload, num_gpu_blocks, block_size, enforce_eager,
        )

    # Check max_model_len vs data length
    max_data_len = max(s.get("length", 0) for s in samples)
    if max_model_len < max_data_len:
        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({max_data_len})")
        print(" This may cause truncation or errors.\n")

    # Initialize LLM
    if verbose:
        print("Initializing LLM...")

    llm_kwargs = {
        "max_model_len": max_model_len,
        "max_num_batched_tokens": max_model_len,
        "enforce_eager": enforce_eager,
        "gpu_memory_utilization": gpu_utilization,
        "kvcache_block_size": block_size,
        "enable_cpu_offload": enable_cpu_offload,
    }

    # num_gpu_blocks is only passed through in offload mode.
    if enable_cpu_offload:
        llm_kwargs["num_gpu_blocks"] = num_gpu_blocks

    llm = LLM(model_path, **llm_kwargs)

    # Sampling params
    # Note: nano-vllm doesn't support greedy (temperature=0), use low temperature instead
    sampling_params = SamplingParams(
        temperature=0.1,  # Low temperature for near-deterministic output
        max_tokens=max_new_tokens,
    )

    # Test each sample
    correct = 0
    results = []

    for i, sample in enumerate(samples):
        sample_idx = sample.get("index", i)
        prompt = sample["input"]
        expected = sample["outputs"][0]
        data_len = sample.get("length", "unknown")

        if verbose:
            print(f"\nSample {sample_idx}: Expected={expected}, Length={data_len}")

        # Generate
        outputs = llm.generate([prompt], sampling_params, use_tqdm=False)
        output_text = outputs[0]["text"]
        output_tokens = outputs[0]["token_ids"]

        # Check result
        passed = check_needle_answer(output_text, expected)
        if passed:
            correct += 1

        results.append({
            "index": sample_idx,
            "expected": expected,
            "output": output_text,
            "passed": passed,
        })

        if verbose:
            status = "PASS" if passed else "FAIL"
            output_preview = output_text[:100].replace('\n', ' ')
            print(f" Output ({len(output_tokens)} tokens): {output_preview}...")
            print(f" Status: {status}")

    # Summary
    if verbose:
        _print_summary(results, correct, total)

    return correct, total


# ============================================================
# CLI Entry Point
# ============================================================

def parse_indices(s: str) -> Optional[List[int]]:
    """
    Parse comma-separated indices like '0,1,2' and inclusive ranges like '0-4'.

    Whitespace around items and empty items (e.g. a trailing comma) are ignored.

    Args:
        s: The raw command-line value.

    Returns:
        The list of indices in the order given, or None when `s` is empty
        (meaning "all samples").

    Raises:
        ValueError: On a malformed item, a reversed range such as '4-0',
            or when the string contains separators but no index at all.
    """
    if not s or not s.strip():
        return None

    indices: List[int] = []
    for raw_part in s.split(','):
        part = raw_part.strip()
        if not part:
            continue
        if '-' in part:
            start_text, _, end_text = part.partition('-')
            try:
                start, end = int(start_text), int(end_text)
            except ValueError:
                raise ValueError(
                    f"Invalid index range {part!r}; expected START-END"
                ) from None
            if start > end:
                # An empty range would silently select nothing.
                raise ValueError(f"Invalid index range {part!r}; start is greater than end")
            indices.extend(range(start, end + 1))
        else:
            indices.append(int(part))

    if not indices:
        raise ValueError(f"No sample indices found in {s!r}")
    return indices


def _build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="RULER NIAH benchmark test for long context LLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all samples with CPU offload (recommended for 24GB GPUs)
  python tests/test_ruler_niah.py --enable-offload

  # Test specific samples
  python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

  # Test with CUDA graph enabled
  python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
        """
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to model (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        default=str(DEFAULT_DATA_FILE),
        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
    )
    parser.add_argument(
        "--sample-indices",
        type=str,
        default="",
        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=DEFAULT_MAX_MODEL_LEN,
        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=DEFAULT_MAX_NEW_TOKENS,
        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
    )
    parser.add_argument(
        "--enable-offload",
        action="store_true",
        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=4,
        help="Number of GPU blocks for CPU offload (default: 4)"
    )
    parser.add_argument(
        "--block-size",
        type=int,
        default=1024,
        help="KV cache block size (default: 1024)"
    )
    parser.add_argument(
        "--gpu-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization fraction (default: 0.9)"
    )
    # --enforce-eager and --verbose are accepted for compatibility only: eager
    # and verbose are already the defaults, and the effective values are
    # derived from --use-cuda-graph and --quiet below.
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        default=True,
        help="Force eager execution, disable CUDA graphs "
             "(already the default; use --use-cuda-graph to turn it off)"
    )
    parser.add_argument(
        "--use-cuda-graph",
        action="store_true",
        help="Enable CUDA graph (overrides --enforce-eager)"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=True,
        help="Print detailed output (already the default; use --quiet to turn it off)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Quiet mode, only print final result"
    )
    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """
    Parse the command line, run the benchmark and report the final status.

    Args:
        argv: Argument list to parse; None means use sys.argv[1:].

    Returns:
        Process exit code: 0 when every sample passed, 1 otherwise.
    """
    args = _build_parser().parse_args(argv)

    # Process arguments
    sample_indices = parse_indices(args.sample_indices)
    enforce_eager = not args.use_cuda_graph
    verbose = not args.quiet

    # Run test
    correct, total = run_ruler_niah_test(
        model_path=os.path.expanduser(args.model),
        data_file=Path(args.data_file),
        sample_indices=sample_indices,
        max_model_len=args.max_model_len,
        max_new_tokens=args.max_new_tokens,
        enable_cpu_offload=args.enable_offload,
        num_gpu_blocks=args.num_gpu_blocks,
        block_size=args.block_size,
        gpu_utilization=args.gpu_utilization,
        enforce_eager=enforce_eager,
        verbose=verbose,
    )

    # Final status
    if correct == total:
        print("test_ruler_niah: PASSED")
        return 0
    print(f"test_ruler_niah: FAILED ({correct}/{total})")
    return 1


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is meant for the
    # interactive interpreter and is absent under `python -S`.
    sys.exit(main())