nano-vllm/tests/test_ruler_niah.py

"""
RULER NIAH benchmark test for LLM.

Tests: Long context retrieval capability using pre-generated RULER benchmark data.
The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
specific magic number from a large context (~32K tokens).

Usage:
    # Test all samples with CPU offload
    python tests/test_ruler_niah.py --enable-offload

    # Test specific samples
    python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

    # Test with custom model
    python tests/test_ruler_niah.py --model /path/to/model --enable-offload

    # Group mode: test in batches with separate LLM initialization per group
    python tests/test_ruler_niah.py --enable-offload --group-size 5
"""

import os
os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"

import argparse
import json
from pathlib import Path
from typing import List, Tuple, Optional

from nanovllm import LLM, SamplingParams
from utils import check_needle_answer


# ============================================================
# Constants
# ============================================================

# Default dataset: a JSONL file stored under data/ruler_niah next to this script.
DEFAULT_DATA_FILE = Path(__file__).parent / "data" / "ruler_niah" / "niah_single_1_32k.jsonl"
# Default model directory, with "~" expanded to the current user's home.
DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
# Default context window passed to the engine: 32 * 1024 = 32768 tokens.
DEFAULT_MAX_MODEL_LEN = 32 * 1024
# Default cap on the number of tokens generated per sample.
DEFAULT_MAX_NEW_TOKENS = 50


# ============================================================
# Data Loading
# ============================================================

def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
    """
    Load RULER NIAH samples from a JSONL file.

    Each line of the file is parsed as one JSON document. A sample's index is
    its zero-based line number in the file. Blank lines keep their line number
    but are skipped rather than handed to the JSON parser.

    Args:
        filepath: Path to the JSONL file
        indices: Optional list of sample indices (zero-based line numbers) to
            load. If None, load all.

    Returns:
        List of parsed samples in file order (the order of `indices` is not
        significant). Other code in this file reads the keys: index, input,
        outputs, length.

    Raises:
        FileNotFoundError: If `filepath` does not exist.
        ValueError: If no sample was loaded (empty file, or no requested index
            matched a non-blank line).
    """
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
        )

    # Build the lookup once: set membership is O(1) per line, whereas testing
    # against the caller's list would rescan that list for every line.
    wanted = None if indices is None else set(indices)

    samples = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if wanted is not None and i not in wanted:
                continue
            if not line.strip():
                # A blank line (e.g. extra newline at end of file) is not a sample.
                continue
            samples.append(json.loads(line))

    if not samples:
        raise ValueError(f"No samples loaded from {filepath}")

    return samples


def count_samples(filepath: Path) -> int:
    """Return the number of lines in a JSONL file (one sample per line)."""
    line_total = 0
    with open(filepath) as handle:
        # Stream the file so that only one line is held in memory at a time.
        for _line in handle:
            line_total += 1
    return line_total


# ============================================================
# Test Function
# ============================================================

def run_ruler_niah_test(
    model_path: str,
    data_file: Path,
    sample_indices: Optional[List[int]] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
    verbose: bool = True,
) -> Tuple[int, int]:
    """
    Evaluate needle retrieval on RULER NIAH samples using one LLM instance.

    The samples are loaded first, then a single LLM is constructed and each
    sample's prompt is generated on its own (one prompt per generate call).
    The generated text is scored with check_needle_answer against the first
    entry of the sample's "outputs".

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        sample_indices: List of sample indices to test (None = all)
        max_model_len: Maximum model context length; also used as
            max_num_batched_tokens
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        num_gpu_blocks: Number of GPU blocks; only forwarded to the LLM when
            CPU offload is enabled
        block_size: KV cache block size
        gpu_utilization: GPU memory utilization fraction
        enforce_eager: Disable CUDA graphs
        verbose: Print the configuration banner, per-sample details and the
            summary. The max_model_len warning is printed regardless.

    Returns:
        (correct, total): Number of correct and total samples
    """
    rule = "=" * 60

    niah_samples = load_ruler_samples(data_file, sample_indices)
    total = len(niah_samples)

    if verbose:
        print(f"\n{rule}")
        print("RULER NIAH Test")
        print(rule)
        print(f"Model: {model_path}")
        print(f"Data file: {data_file}")
        print(f"Samples: {total}")
        print(f"Max model len: {max_model_len}")
        print(f"Max new tokens: {max_new_tokens}")
        print(f"CPU offload: {enable_cpu_offload}")
        if enable_cpu_offload:
            print(f"  num_gpu_blocks: {num_gpu_blocks}")
            print(f"  block_size: {block_size}")
        print(f"Enforce eager: {enforce_eager}")
        print(f"{rule}\n")

    # Warn (even in quiet mode) when a sample's recorded length exceeds the
    # configured context window; samples without "length" count as 0.
    longest_sample = max(entry.get("length", 0) for entry in niah_samples)
    if max_model_len < longest_sample:
        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({longest_sample})")
        print("         This may cause truncation or errors.\n")

    if verbose:
        print("Initializing LLM...")

    engine_options = dict(
        max_model_len=max_model_len,
        max_num_batched_tokens=max_model_len,
        enforce_eager=enforce_eager,
        gpu_memory_utilization=gpu_utilization,
        kvcache_block_size=block_size,
        enable_cpu_offload=enable_cpu_offload,
    )
    if enable_cpu_offload:
        # The GPU block budget is only passed along in offload mode.
        engine_options["num_gpu_blocks"] = num_gpu_blocks

    llm = LLM(model_path, **engine_options)

    # Note: nano-vllm doesn't support greedy (temperature=0), so a low
    # temperature is used for near-deterministic output instead.
    sampling_params = SamplingParams(
        temperature=0.1,
        max_tokens=max_new_tokens,
    )

    correct = 0
    records = []

    for position, entry in enumerate(niah_samples):
        # Fall back to the position in the loaded list when the sample does
        # not carry its own "index".
        sample_idx = entry.get("index", position)
        prompt = entry["input"]
        expected = entry["outputs"][0]
        data_len = entry.get("length", "unknown")

        if verbose:
            print(f"\nSample {sample_idx}: Expected={expected}, Length={data_len}")

        generation = llm.generate([prompt], sampling_params, use_tqdm=False)[0]
        output_text = generation["text"]
        output_tokens = generation["token_ids"]

        passed = check_needle_answer(output_text, expected)
        correct += 1 if passed else 0

        records.append({
            "index": sample_idx,
            "expected": expected,
            "output": output_text,
            "passed": passed,
        })

        if verbose:
            # Keep the preview on a single console line.
            preview = output_text[:100].replace('\n', ' ')
            print(f"  Output ({len(output_tokens)} tokens): {preview}...")
            print(f"  Status: {'PASS' if passed else 'FAIL'}")

    if not verbose:
        return correct, total

    print(f"\n{rule}")
    print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
    print(f"{rule}\n")

    if correct < total:
        print("Failed samples:")
        for record in records:
            if record["passed"]:
                continue
            print(f"  Sample {record['index']}: expected={record['expected']}, got={record['output'][:50]}...")

    return correct, total


# ============================================================
# Grouped Test Function
# ============================================================

def run_grouped_test(
    model_path: str,
    data_file: Path,
    group_size: int = 5,
    total_samples: Optional[int] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
) -> Tuple[int, int, List[dict]]:
    """
    Run RULER NIAH test in groups, with separate LLM initialization per group.

    Samples 0..total_samples-1 are split into consecutive groups of
    `group_size`; each group is handed to run_ruler_niah_test (always in
    verbose mode), which builds its own LLM. Between groups the garbage
    collector is run and the CUDA cache is emptied.

    This mode is useful for:
    - Avoiding state accumulation issues
    - Testing LLM initialization stability
    - Running large-scale tests with memory cleanup between groups

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        group_size: Number of samples per group (must be positive)
        total_samples: Total samples to test (None = all in file); capped at
            the number of lines in the file
        Other args: Same as run_ruler_niah_test

    Returns:
        (total_correct, total_tested, group_results): Results summary.
        Each entry of group_results is a dict with the keys: group, samples,
        correct, total, accuracy, time.

    Raises:
        ValueError: If group_size is not positive. Without this check a zero
            size would divide by zero, and a negative size would run no groups
            and report 0/0, which callers treat as a pass.
    """
    if group_size <= 0:
        raise ValueError(f"group_size must be a positive integer, got {group_size}")

    import time
    import gc
    import torch

    # Count total samples in file
    file_sample_count = count_samples(data_file)
    if total_samples is None:
        total_samples = file_sample_count
    else:
        total_samples = min(total_samples, file_sample_count)

    # Ceiling division: the last group may be smaller than group_size.
    num_groups = (total_samples + group_size - 1) // group_size

    print(f"\n{'='*60}")
    print(f"RULER NIAH Grouped Test")
    print(f"{'='*60}")
    print(f"Model: {model_path}")
    print(f"Data file: {data_file}")
    print(f"Total samples: {total_samples}")
    print(f"Group size: {group_size}")
    print(f"Number of groups: {num_groups}")
    print(f"CPU offload: {enable_cpu_offload}")
    print(f"{'='*60}\n")

    total_correct = 0
    total_tested = 0
    group_results = []

    test_start_time = time.time()

    for group_idx in range(num_groups):
        start_idx = group_idx * group_size
        end_idx = min(start_idx + group_size, total_samples)
        sample_indices = list(range(start_idx, end_idx))

        print(f"\n{'='*60}")
        print(f"Group {group_idx + 1}/{num_groups}: Samples {start_idx}-{end_idx - 1}")
        print(f"{'='*60}")

        group_start_time = time.time()

        # Run test for this group (this constructs a fresh LLM)
        correct, tested = run_ruler_niah_test(
            model_path=model_path,
            data_file=data_file,
            sample_indices=sample_indices,
            max_model_len=max_model_len,
            max_new_tokens=max_new_tokens,
            enable_cpu_offload=enable_cpu_offload,
            num_gpu_blocks=num_gpu_blocks,
            block_size=block_size,
            gpu_utilization=gpu_utilization,
            enforce_eager=enforce_eager,
            verbose=True,
        )

        group_time = time.time() - group_start_time

        total_correct += correct
        total_tested += tested

        group_result = {
            "group": group_idx + 1,
            "samples": f"{start_idx}-{end_idx - 1}",
            "correct": correct,
            "total": tested,
            "accuracy": 100 * correct / tested if tested > 0 else 0,
            "time": group_time,
        }
        group_results.append(group_result)

        print(f"\nGroup {group_idx + 1} Summary: {correct}/{tested} PASSED ({group_result['accuracy']:.1f}%) in {group_time:.1f}s")

        # Force cleanup between groups
        gc.collect()
        torch.cuda.empty_cache()

        # Small delay to ensure port is released (skipped after the last group)
        if group_idx < num_groups - 1:
            time.sleep(3)

    total_time = time.time() - test_start_time

    # Final summary
    print(f"\n{'='*60}")
    print(f"FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"\nGroup Results:")
    print(f"{'Group':<8} {'Samples':<12} {'Result':<12} {'Accuracy':<10} {'Time':<10}")
    print(f"{'-'*52}")
    for r in group_results:
        print(f"{r['group']:<8} {r['samples']:<12} {r['correct']}/{r['total']:<9} {r['accuracy']:.1f}%{'':<5} {r['time']:.1f}s")

    print(f"{'-'*52}")
    overall_accuracy = 100 * total_correct / total_tested if total_tested > 0 else 0
    # Avoid printing the nonsensical range "0--1" when nothing was tested.
    total_label = '0-' + str(total_tested - 1) if total_tested > 0 else '-'
    print(f"{'TOTAL':<8} {total_label:<12} {total_correct}/{total_tested:<9} {overall_accuracy:.1f}%{'':<5} {total_time:.1f}s")
    print(f"{'='*60}\n")

    return total_correct, total_tested, group_results


# ============================================================
# CLI Entry Point
# ============================================================

def parse_indices(s: str) -> Optional[List[int]]:
    """Parse comma-separated indices like '0,1,2' or range like '0-4'.

    Items and ranges may be mixed ('0,2-4'). Ranges are inclusive on both
    ends; a range whose end is smaller than its start contributes nothing.
    Surrounding whitespace and empty items (for example a trailing comma)
    are ignored.

    Args:
        s: The raw index specification.

    Returns:
        The indices in the order written (duplicates are kept), or None when
        the specification is empty or only whitespace, which callers treat
        as "all samples".

    Raises:
        ValueError: If an item is not an integer or a range is malformed
            (negative numbers are not supported because '-' marks a range).
    """
    if not s or not s.strip():
        return None
    indices = []
    for part in s.split(','):
        part = part.strip()
        if not part:
            # Tolerate stray separators such as "0,1," instead of failing on int('').
            continue
        if '-' in part:
            start, end = part.split('-')
            indices.extend(range(int(start), int(end) + 1))
        else:
            indices.append(int(part))
    return indices


# Command-line entry point: parse the options, run either the grouped or the
# standard test, print a final PASSED/FAILED line and exit with status 1 when
# any sample failed.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="RULER NIAH benchmark test for long context LLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all samples with CPU offload (recommended for 24GB GPUs)
  python tests/test_ruler_niah.py --enable-offload

  # Test specific samples
  python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

  # Test with CUDA graph enabled
  python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
        """
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to model (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        default=str(DEFAULT_DATA_FILE),
        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
    )
    parser.add_argument(
        "--sample-indices",
        type=str,
        default="",
        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=DEFAULT_MAX_MODEL_LEN,
        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=DEFAULT_MAX_NEW_TOKENS,
        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
    )
    parser.add_argument(
        "--enable-offload",
        action="store_true",
        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=4,
        help="Number of GPU blocks for CPU offload (default: 4)"
    )
    parser.add_argument(
        "--block-size",
        type=int,
        default=1024,
        help="KV cache block size (default: 1024)"
    )
    parser.add_argument(
        "--gpu-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization fraction (default: 0.9)"
    )
    # Accepted for command-line compatibility only: the value is never read.
    # Eager mode is the default and is switched off by --use-cuda-graph.
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        default=True,
        help="Force eager execution, disable CUDA graphs (default: True)"
    )
    parser.add_argument(
        "--use-cuda-graph",
        action="store_true",
        help="Enable CUDA graph (overrides --enforce-eager)"
    )
    # Accepted for command-line compatibility only: the value is never read.
    # Verbosity is on by default and is switched off by --quiet.
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=True,
        help="Print detailed output (default: True)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Quiet mode, only print final result"
    )
    parser.add_argument(
        "--group-size",
        type=int,
        default=0,
        help="Enable grouped testing mode with specified group size. Each group initializes LLM separately. (default: 0 = disabled)"
    )
    parser.add_argument(
        "--total-samples",
        type=int,
        default=0,
        help="Total number of samples to test in group mode (default: 0 = all samples in file)"
    )

    args = parser.parse_args()

    # Process arguments
    sample_indices = parse_indices(args.sample_indices)
    enforce_eager = not args.use_cuda_graph
    verbose = not args.quiet

    # Check if group mode is enabled
    if args.group_size > 0:
        # Grouped testing mode: samples are selected by --group-size and
        # --total-samples. --quiet has no effect here because each group is
        # run verbosely.
        if sample_indices is not None:
            # Make it visible that the explicit selection is not applied,
            # instead of silently testing a different set of samples.
            print("WARNING: --sample-indices is ignored in group mode (--group-size > 0)")
        total_samples = args.total_samples if args.total_samples > 0 else None
        correct, total, _ = run_grouped_test(
            model_path=os.path.expanduser(args.model),
            data_file=Path(args.data_file),
            group_size=args.group_size,
            total_samples=total_samples,
            max_model_len=args.max_model_len,
            max_new_tokens=args.max_new_tokens,
            enable_cpu_offload=args.enable_offload,
            num_gpu_blocks=args.num_gpu_blocks,
            block_size=args.block_size,
            gpu_utilization=args.gpu_utilization,
            enforce_eager=enforce_eager,
        )
    else:
        # Standard testing mode
        correct, total = run_ruler_niah_test(
            model_path=os.path.expanduser(args.model),
            data_file=Path(args.data_file),
            sample_indices=sample_indices,
            max_model_len=args.max_model_len,
            max_new_tokens=args.max_new_tokens,
            enable_cpu_offload=args.enable_offload,
            num_gpu_blocks=args.num_gpu_blocks,
            block_size=args.block_size,
            gpu_utilization=args.gpu_utilization,
            enforce_eager=enforce_eager,
            verbose=verbose,
        )

    # Final status
    if correct == total:
        print("test_ruler_niah: PASSED")
    else:
        print(f"test_ruler_niah: FAILED ({correct}/{total})")
        # Raise SystemExit directly: the exit() builtin is installed by the
        # site module for interactive use and is missing under `python -S`.
        raise SystemExit(1)