"""
RULER NIAH benchmark test for LLM.

Tests: Long context retrieval capability using pre-generated RULER benchmark data.
The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
specific magic number from a large context (~32K tokens).

Usage:
    # Test all samples with CPU offload
    python tests/test_ruler_niah.py --enable-offload

    # Test specific samples
    python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

    # Test with custom model
    python tests/test_ruler_niah.py --model /path/to/model --enable-offload

    # Group mode: test in batches with separate LLM initialization per group
    python tests/test_ruler_niah.py --enable-offload --group-size 5
"""
|
|
|
|
import os
|
|
os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"
|
|
|
|
import argparse
|
|
import json
|
|
from pathlib import Path
|
|
from typing import List, Tuple, Optional
|
|
|
|
from nanovllm import LLM, SamplingParams
|
|
from utils import check_needle_answer
|
|
|
|
|
|
# ============================================================
# Constants
# ============================================================

# Pre-generated RULER NIAH data: single-needle samples at ~32K tokens (JSONL).
DEFAULT_DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
# Default model checkpoint; override with --model.
DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
# Context window must cover the ~32K-token haystack in the default data file.
DEFAULT_MAX_MODEL_LEN = 32768
# The needle answer is a short magic number, so 50 new tokens is ample.
DEFAULT_MAX_NEW_TOKENS = 50
|
|
|
|
|
|
# ============================================================
|
|
# Data Loading
|
|
# ============================================================
|
|
|
|
def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
    """
    Load RULER NIAH samples from a JSONL file.

    Args:
        filepath: Path to the JSONL file
        indices: Optional list of sample indices (0-based line numbers) to
            load. If None, load all samples.

    Returns:
        List of sample dicts with keys: index, input, outputs, length

    Raises:
        FileNotFoundError: If the data file does not exist.
        ValueError: If no samples were loaded (empty file, or no line index
            matched ``indices``).
    """
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
        )

    # Membership tests against a list are O(n) per line; each line here is a
    # ~32K-token prompt, so use a set and stop reading once every requested
    # index has been collected.
    wanted = set(indices) if indices is not None else None

    samples = []
    with open(filepath) as f:
        for i, line in enumerate(f):
            if wanted is None or i in wanted:
                samples.append(json.loads(line))
                if wanted is not None and len(samples) == len(wanted):
                    break

    if not samples:
        raise ValueError(f"No samples loaded from {filepath}")

    return samples
|
|
|
|
|
|
def count_samples(filepath: Path) -> int:
    """Return the number of lines (one sample per line) in a JSONL file."""
    total = 0
    with open(filepath) as handle:
        for _ in handle:
            total += 1
    return total
|
|
|
|
|
|
# ============================================================
|
|
# Test Function
|
|
# ============================================================
|
|
|
|
def run_ruler_niah_test(
    model_path: str,
    data_file: Path,
    sample_indices: Optional[List[int]] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
    verbose: bool = True,
) -> Tuple[int, int]:
    """
    Run RULER NIAH test on loaded samples.

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        sample_indices: List of sample indices to test (None = all)
        max_model_len: Maximum model context length
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        num_gpu_blocks: Number of GPU blocks for offload
        block_size: KV cache block size
        gpu_utilization: GPU memory utilization fraction
        enforce_eager: Disable CUDA graphs
        verbose: Print detailed output

    Returns:
        (correct, total): Number of correct and total samples

    Raises:
        FileNotFoundError: If data_file does not exist (from load_ruler_samples).
        ValueError: If no samples match sample_indices (from load_ruler_samples).
    """
    # Load samples (raises if the file is missing or nothing matched, so
    # `total` is guaranteed > 0 below).
    samples = load_ruler_samples(data_file, sample_indices)
    total = len(samples)

    if verbose:
        print(f"\n{'='*60}")
        print(f"RULER NIAH Test")
        print(f"{'='*60}")
        print(f"Model: {model_path}")
        print(f"Data file: {data_file}")
        print(f"Samples: {total}")
        print(f"Max model len: {max_model_len}")
        print(f"Max new tokens: {max_new_tokens}")
        print(f"CPU offload: {enable_cpu_offload}")
        if enable_cpu_offload:
            print(f" num_gpu_blocks: {num_gpu_blocks}")
            print(f" block_size: {block_size}")
        print(f"Enforce eager: {enforce_eager}")
        print(f"{'='*60}\n")

    # Check max_model_len vs data length. `length` is the sample's token
    # count as recorded in the data file (missing entries count as 0).
    max_data_len = max(s.get("length", 0) for s in samples)
    if max_model_len < max_data_len:
        # Warn but do not abort: truncation behavior is left to the engine.
        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({max_data_len})")
        print(f" This may cause truncation or errors.\n")

    # Initialize LLM
    if verbose:
        print("Initializing LLM...")

    # max_num_batched_tokens == max_model_len so a single full-context
    # prompt can be scheduled in one batch.
    llm_kwargs = {
        "max_model_len": max_model_len,
        "max_num_batched_tokens": max_model_len,
        "enforce_eager": enforce_eager,
        "gpu_memory_utilization": gpu_utilization,
        "kvcache_block_size": block_size,
        "enable_cpu_offload": enable_cpu_offload,
    }

    # num_gpu_blocks is only meaningful (and presumably only accepted) in
    # offload mode, so it is passed conditionally.
    if enable_cpu_offload:
        llm_kwargs["num_gpu_blocks"] = num_gpu_blocks

    llm = LLM(model_path, **llm_kwargs)

    # Sampling params
    # Note: nano-vllm doesn't support greedy (temperature=0), use low temperature instead
    sampling_params = SamplingParams(
        temperature=0.1,  # Low temperature for near-deterministic output
        max_tokens=max_new_tokens,
    )

    # Test each sample. Samples are generated one at a time (batch of 1),
    # which keeps peak KV-cache usage to a single 32K-token sequence.
    correct = 0
    results = []

    for i, sample in enumerate(samples):
        # Fall back to the enumeration index when the record has no "index".
        sample_idx = sample.get("index", i)
        prompt = sample["input"]
        # RULER records store the gold answer(s) in "outputs"; the NIAH task
        # has a single needle, so only the first entry is checked.
        expected = sample["outputs"][0]
        data_len = sample.get("length", "unknown")

        if verbose:
            print(f"\nSample {sample_idx}: Expected={expected}, Length={data_len}")

        # Generate
        outputs = llm.generate([prompt], sampling_params, use_tqdm=False)
        output_text = outputs[0]["text"]
        output_tokens = outputs[0]["token_ids"]

        # Check result (substring/format matching lives in utils.check_needle_answer).
        passed = check_needle_answer(output_text, expected)
        if passed:
            correct += 1

        results.append({
            "index": sample_idx,
            "expected": expected,
            "output": output_text,
            "passed": passed,
        })

        if verbose:
            status = "PASS" if passed else "FAIL"
            # Newlines flattened so the preview stays on one console line.
            output_preview = output_text[:100].replace('\n', ' ')
            print(f" Output ({len(output_tokens)} tokens): {output_preview}...")
            print(f" Status: {status}")

    # Summary
    if verbose:
        print(f"\n{'='*60}")
        print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
        print(f"{'='*60}\n")

        if correct < total:
            print("Failed samples:")
            for r in results:
                if not r["passed"]:
                    print(f" Sample {r['index']}: expected={r['expected']}, got={r['output'][:50]}...")

    return correct, total
|
|
|
|
|
|
# ============================================================
|
|
# Grouped Test Function
|
|
# ============================================================
|
|
|
|
def run_grouped_test(
    model_path: str,
    data_file: Path,
    group_size: int = 5,
    total_samples: Optional[int] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
) -> Tuple[int, int, List[dict]]:
    """
    Run RULER NIAH test in groups, with separate LLM initialization per group.

    This mode is useful for:
    - Avoiding state accumulation issues
    - Testing LLM initialization stability
    - Running large-scale tests with memory cleanup between groups

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        group_size: Number of samples per group
        total_samples: Total samples to test (None = all in file)
        Other args: Same as run_ruler_niah_test

    Returns:
        (total_correct, total_tested, group_results): Results summary
    """
    # Local imports: only needed in grouped mode, and torch may be heavy to
    # import at module load time.
    import time
    import gc
    import torch

    # Count total samples in file; clamp the requested count to what exists.
    file_sample_count = count_samples(data_file)
    if total_samples is None:
        total_samples = file_sample_count
    else:
        total_samples = min(total_samples, file_sample_count)

    # Ceiling division: the last group may be smaller than group_size.
    num_groups = (total_samples + group_size - 1) // group_size

    print(f"\n{'='*60}")
    print(f"RULER NIAH Grouped Test")
    print(f"{'='*60}")
    print(f"Model: {model_path}")
    print(f"Data file: {data_file}")
    print(f"Total samples: {total_samples}")
    print(f"Group size: {group_size}")
    print(f"Number of groups: {num_groups}")
    print(f"CPU offload: {enable_cpu_offload}")
    print(f"{'='*60}\n")

    total_correct = 0
    total_tested = 0
    group_results = []
    # NOTE(review): all_failed is never populated or read; failed samples are
    # only printed inside run_ruler_niah_test.
    all_failed = []

    test_start_time = time.time()

    for group_idx in range(num_groups):
        # Contiguous slice of sample indices for this group.
        start_idx = group_idx * group_size
        end_idx = min(start_idx + group_size, total_samples)
        sample_indices = list(range(start_idx, end_idx))

        print(f"\n{'='*60}")
        print(f"Group {group_idx + 1}/{num_groups}: Samples {start_idx}-{end_idx - 1}")
        print(f"{'='*60}")

        group_start_time = time.time()

        # Run test for this group. A fresh LLM is constructed inside
        # run_ruler_niah_test on every call and dropped when it returns.
        correct, tested = run_ruler_niah_test(
            model_path=model_path,
            data_file=data_file,
            sample_indices=sample_indices,
            max_model_len=max_model_len,
            max_new_tokens=max_new_tokens,
            enable_cpu_offload=enable_cpu_offload,
            num_gpu_blocks=num_gpu_blocks,
            block_size=block_size,
            gpu_utilization=gpu_utilization,
            enforce_eager=enforce_eager,
            verbose=True,
        )

        group_time = time.time() - group_start_time

        total_correct += correct
        total_tested += tested

        group_result = {
            "group": group_idx + 1,
            "samples": f"{start_idx}-{end_idx - 1}",
            "correct": correct,
            "total": tested,
            # Guard against tested == 0 (e.g. an empty trailing group).
            "accuracy": 100 * correct / tested if tested > 0 else 0,
            "time": group_time,
        }
        group_results.append(group_result)

        print(f"\nGroup {group_idx + 1} Summary: {correct}/{tested} PASSED ({group_result['accuracy']:.1f}%) in {group_time:.1f}s")

        # Force cleanup between groups so the next LLM starts from a clean
        # CUDA allocator state.
        gc.collect()
        torch.cuda.empty_cache()

        # Small delay to ensure port is released
        if group_idx < num_groups - 1:
            time.sleep(3)

    total_time = time.time() - test_start_time

    # Final summary: fixed-width table of per-group results plus a TOTAL row.
    print(f"\n{'='*60}")
    print(f"FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"\nGroup Results:")
    print(f"{'Group':<8} {'Samples':<12} {'Result':<12} {'Accuracy':<10} {'Time':<10}")
    print(f"{'-'*52}")
    for r in group_results:
        print(f"{r['group']:<8} {r['samples']:<12} {r['correct']}/{r['total']:<9} {r['accuracy']:.1f}%{'':<5} {r['time']:.1f}s")

    print(f"{'-'*52}")
    overall_accuracy = 100 * total_correct / total_tested if total_tested > 0 else 0
    print(f"{'TOTAL':<8} {'0-' + str(total_tested-1):<12} {total_correct}/{total_tested:<9} {overall_accuracy:.1f}%{'':<5} {total_time:.1f}s")
    print(f"{'='*60}\n")

    return total_correct, total_tested, group_results
|
|
|
|
|
|
# ============================================================
|
|
# CLI Entry Point
|
|
# ============================================================
|
|
|
|
def parse_indices(s: str) -> Optional[List[int]]:
    """
    Parse comma-separated indices like '0,1,2' or a range like '0-4'.

    Mixed forms are accepted, e.g. '0,3-5,9' -> [0, 3, 4, 5, 9]. Ranges are
    inclusive on both ends. Surrounding whitespace in each part is ignored.

    Args:
        s: Index spec string. Empty or whitespace-only means "all samples".

    Returns:
        List of indices, or None when the spec is empty (meaning "test all").
        (The return annotation previously claimed List[int] despite the
        None path.)

    Raises:
        ValueError: If a part is not an integer or a valid 'start-end' range.
    """
    if not s or not s.strip():
        return None
    indices: List[int] = []
    for part in s.split(','):
        part = part.strip()  # tolerate '0, 1, 2' from shell quoting
        if '-' in part:
            start, end = part.split('-')
            indices.extend(range(int(start), int(end) + 1))
        else:
            indices.append(int(part))
    return indices
|
|
|
|
|
|
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="RULER NIAH benchmark test for long context LLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Test all samples with CPU offload (recommended for 24GB GPUs)
  python tests/test_ruler_niah.py --enable-offload

  # Test specific samples
  python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

  # Test with CUDA graph enabled
  python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
"""
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to model (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        default=str(DEFAULT_DATA_FILE),
        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
    )
    parser.add_argument(
        "--sample-indices",
        type=str,
        default="",
        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=DEFAULT_MAX_MODEL_LEN,
        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=DEFAULT_MAX_NEW_TOKENS,
        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
    )
    parser.add_argument(
        "--enable-offload",
        action="store_true",
        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=4,
        help="Number of GPU blocks for CPU offload (default: 4)"
    )
    parser.add_argument(
        "--block-size",
        type=int,
        default=1024,
        help="KV cache block size (default: 1024)"
    )
    parser.add_argument(
        "--gpu-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization fraction (default: 0.9)"
    )
    # NOTE(review): dead flag — args.enforce_eager is never read below; the
    # effective value is derived from --use-cuda-graph. With default=True a
    # store_true flag is also a no-op when passed.
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        default=True,
        help="Force eager execution, disable CUDA graphs (default: True)"
    )
    parser.add_argument(
        "--use-cuda-graph",
        action="store_true",
        help="Enable CUDA graph (overrides --enforce-eager)"
    )
    # NOTE(review): dead flag — args.verbose is never read below; verbosity
    # is derived from --quiet instead.
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=True,
        help="Print detailed output (default: True)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Quiet mode, only print final result"
    )
    parser.add_argument(
        "--group-size",
        type=int,
        default=0,
        help="Enable grouped testing mode with specified group size. Each group initializes LLM separately. (default: 0 = disabled)"
    )
    parser.add_argument(
        "--total-samples",
        type=int,
        default=0,
        help="Total number of samples to test in group mode (default: 0 = all samples in file)"
    )

    args = parser.parse_args()

    # Process arguments. parse_indices returns None for "" (= all samples);
    # eager execution is on unless CUDA graphs were explicitly requested.
    sample_indices = parse_indices(args.sample_indices)
    enforce_eager = not args.use_cuda_graph
    verbose = not args.quiet

    # Check if group mode is enabled (any positive --group-size).
    if args.group_size > 0:
        # Grouped testing mode. 0 means "all samples in file"; group mode
        # ignores --sample-indices and always walks samples from index 0.
        total_samples = args.total_samples if args.total_samples > 0 else None
        correct, total, _ = run_grouped_test(
            model_path=os.path.expanduser(args.model),
            data_file=Path(args.data_file),
            group_size=args.group_size,
            total_samples=total_samples,
            max_model_len=args.max_model_len,
            max_new_tokens=args.max_new_tokens,
            enable_cpu_offload=args.enable_offload,
            num_gpu_blocks=args.num_gpu_blocks,
            block_size=args.block_size,
            gpu_utilization=args.gpu_utilization,
            enforce_eager=enforce_eager,
        )
    else:
        # Standard testing mode: single LLM instance for all samples.
        correct, total = run_ruler_niah_test(
            model_path=os.path.expanduser(args.model),
            data_file=Path(args.data_file),
            sample_indices=sample_indices,
            max_model_len=args.max_model_len,
            max_new_tokens=args.max_new_tokens,
            enable_cpu_offload=args.enable_offload,
            num_gpu_blocks=args.num_gpu_blocks,
            block_size=args.block_size,
            gpu_utilization=args.gpu_utilization,
            enforce_eager=enforce_eager,
            verbose=verbose,
        )

    # Final status: non-zero exit code on any failed sample, so this script
    # can gate CI.
    if correct == total:
        print("test_ruler_niah: PASSED")
    else:
        print(f"test_ruler_niah: FAILED ({correct}/{total})")
        exit(1)
|