[tests] Added test_niah_standalone.py.
This commit is contained in:
297
docs/ruler_niah_standalone_test.md
Normal file
297
docs/ruler_niah_standalone_test.md
Normal file
@@ -0,0 +1,297 @@
|
||||
# RULER NIAH Standalone Test Plan
|
||||
|
||||
## Overview
|
||||
|
||||
This document describes how to independently test nano-vllm's CPU offload functionality using RULER benchmark's NIAH (Needle-In-A-Haystack) task data.
|
||||
|
||||
## Background
|
||||
|
||||
### Problem Being Investigated
|
||||
|
||||
When running 32K sequence length tests with CPU offload mode, the model outputs garbled text instead of finding the magic number. This issue was traced to:
|
||||
|
||||
- **Root Cause**: Ring buffer `max_seq_len` was set equal to `max_model_len` (32768)
|
||||
- **Issue**: When prefill uses ~32K tokens, decode needs to store KV at position 32768+, but ring buffer only has indices 0-32767
|
||||
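- **Fix Applied**: In `nanovllm/kvcache/__init__.py`, the ring buffer is now sized as `max_seq_len = config.max_model_len + max_new_tokens`, where `max_new_tokens` defaults to 512 (i.e. `max_model_len + 512` by default)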
|
||||
|
||||
### Test Objective
|
||||
|
||||
Verify that the fix works correctly by running a standalone test with actual RULER NIAH data.
|
||||
|
||||
## Step 1: Copy Test Data
|
||||
|
||||
### Source Location
|
||||
|
||||
```
|
||||
/home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl
|
||||
```
|
||||
|
||||
### Data Format
|
||||
|
||||
Each line is a JSON object:
|
||||
|
||||
```json
|
||||
{
|
||||
"index": 0,
|
||||
"input": "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nA special magic number is hidden within the following text...",
|
||||
"outputs": ["8930103"],
|
||||
"length": 32768
|
||||
}
|
||||
```
|
||||
|
||||
- `input`: Full prompt with Llama 3.1 chat template (~122K characters, ~30K tokens)
|
||||
- `outputs`: Expected answer (the magic number to find)
|
||||
- `length`: Target sequence length in tokens
|
||||
|
||||
### Copy Command
|
||||
|
||||
```bash
|
||||
mkdir -p /home/zijie/Code/nano-vllm/tests/data/ruler_niah
|
||||
cp /home/zijie/Code/x-attention/eval/RULER/scripts/benchmark_root/full_fuse_16_llama3.1-8b-chat/synthetic/32768/data/niah_single_1/validation.jsonl \
|
||||
/home/zijie/Code/nano-vllm/tests/data/ruler_niah/niah_single_1_32k.jsonl
|
||||
```
|
||||
|
||||
## Step 2: Create Test Script
|
||||
|
||||
Create `/home/zijie/Code/nano-vllm/tests/test_ruler_niah_32k.py`:
|
||||
|
||||
```python
|
||||
"""
|
||||
Standalone test for RULER NIAH task with 32K context length.
|
||||
|
||||
This test verifies that CPU offload mode correctly handles long sequences
|
||||
where prefill tokens approach max_model_len.
|
||||
|
||||
Usage:
|
||||
python tests/test_ruler_niah_32k.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import torch
|
||||
from pathlib import Path
|
||||
|
||||
from nanovllm import LLM
|
||||
from nanovllm.config import SamplingParams
|
||||
|
||||
# Configuration
|
||||
MODEL_PATH = "/data/models/Llama-3.1-8B-Instruct"
|
||||
DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
|
||||
MAX_MODEL_LEN = 32768
|
||||
MAX_NEW_TOKENS = 50
|
||||
|
||||
# CPU Offload Settings
|
||||
ENABLE_CPU_OFFLOAD = True
|
||||
NUM_GPU_BLOCKS = 4
|
||||
BLOCK_SIZE = 1024
|
||||
|
||||
|
||||
def load_test_sample(filepath: Path, index: int = 0) -> dict:
    """Return the JSON object stored on line ``index`` of a JSONL file.

    Raises:
        ValueError: if the file has no line at position ``index``.
    """
    with open(filepath) as handle:
        # Scan lazily and stop at the first line whose position matches.
        wanted = next(
            (raw for pos, raw in enumerate(handle) if pos == index),
            None,
        )

    if wanted is None:
        raise ValueError(f"Sample index {index} not found")
    return json.loads(wanted)
|
||||
|
||||
|
||||
def test_niah_single():
    """Test NIAH single needle task with 32K context.

    Loads sample 0 from ``DATA_FILE``, runs it through an ``LLM`` configured
    with the module-level CPU-offload settings, and checks whether the
    expected magic number appears in the generated text.

    Returns:
        True if the expected answer is a substring of the generated text,
        False otherwise.
    """
    print("=" * 60)
    print("RULER NIAH 32K Standalone Test")
    print("=" * 60)

    # Load test data
    sample = load_test_sample(DATA_FILE, index=0)
    prompt = sample["input"]
    expected = sample["outputs"][0]

    print(f"Prompt length: {len(prompt)} characters")
    print(f"Expected answer: {expected}")
    print()

    # Initialize model with CPU offload
    print("Initializing LLM with CPU offload...")
    llm = LLM(
        model=MODEL_PATH,
        max_model_len=MAX_MODEL_LEN,
        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
        num_gpu_blocks=NUM_GPU_BLOCKS,
        kvcache_block_size=BLOCK_SIZE,
        enforce_eager=True,  # Disable CUDA graphs for debugging
    )

    # Generate
    print("Generating response...")
    sampling_params = SamplingParams(
        # nano-vllm does not support greedy decoding (temperature=0), so use
        # a low temperature for near-deterministic output, matching
        # tests/test_ruler_niah.py.
        temperature=0.1,
        max_tokens=MAX_NEW_TOKENS,
    )

    outputs = llm.generate([prompt], sampling_params)
    # Each result is read as a dict with a "text" entry, the same way
    # tests/test_ruler_niah.py consumes generate() output.
    generated_text = outputs[0]["text"]

    print()
    print("=" * 60)
    print("Results")
    print("=" * 60)
    print(f"Expected: {expected}")
    print(f"Generated: {generated_text[:200]}...")
    print()

    # Check if expected number is in output
    if expected in generated_text:
        print("SUCCESS: Magic number found in output!")
        return True

    print("FAILED: Magic number NOT found in output")
    print(f"Full output: {generated_text}")
    return False
|
||||
|
||||
|
||||
def test_multiple_samples(num_samples: int = 5):
    """Test multiple NIAH samples.

    Runs the first ``num_samples`` samples of ``DATA_FILE`` through a single
    ``LLM`` instance and reports per-sample pass/fail plus overall accuracy.

    Args:
        num_samples: Number of leading samples to evaluate; must be positive.

    Returns:
        True if every sample's expected answer was found in its output.

    Raises:
        ValueError: if ``num_samples`` is not positive.
    """
    # Fail fast: a non-positive count would otherwise only surface as a
    # ZeroDivisionError in the accuracy line, after the model was loaded.
    if num_samples <= 0:
        raise ValueError(f"num_samples must be positive, got {num_samples}")

    print("=" * 60)
    print(f"Testing {num_samples} NIAH samples with 32K context")
    print("=" * 60)

    # Initialize model once
    llm = LLM(
        model=MODEL_PATH,
        max_model_len=MAX_MODEL_LEN,
        enable_cpu_offload=ENABLE_CPU_OFFLOAD,
        num_gpu_blocks=NUM_GPU_BLOCKS,
        kvcache_block_size=BLOCK_SIZE,
        enforce_eager=True,
    )

    sampling_params = SamplingParams(
        # nano-vllm does not support greedy decoding (temperature=0), so use
        # a low temperature for near-deterministic output, matching
        # tests/test_ruler_niah.py.
        temperature=0.1,
        max_tokens=MAX_NEW_TOKENS,
    )

    correct = 0
    for i in range(num_samples):
        sample = load_test_sample(DATA_FILE, index=i)
        prompt = sample["input"]
        expected = sample["outputs"][0]

        outputs = llm.generate([prompt], sampling_params)
        # Each result is read as a dict with a "text" entry, the same way
        # tests/test_ruler_niah.py consumes generate() output.
        generated_text = outputs[0]["text"]

        if expected in generated_text:
            print(f"Sample {i}: PASS (found {expected})")
            correct += 1
        else:
            print(f"Sample {i}: FAIL (expected {expected}, got: {generated_text[:50]}...)")

    print()
    print(f"Accuracy: {correct}/{num_samples} ({100*correct/num_samples:.1f}%)")
    return correct == num_samples
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # "--all" as the first CLI argument selects the multi-sample run;
    # anything else (or no argument) runs the single-sample test.
    run_all = sys.argv[1:2] == ["--all"]
    success = test_multiple_samples(5) if run_all else test_niah_single()

    # Exit status 0 on success, 1 on failure.
    sys.exit(0 if success else 1)
|
||||
```
|
||||
|
||||
## Step 3: Run Test
|
||||
|
||||
### Single Sample Test
|
||||
|
||||
```bash
|
||||
cd /home/zijie/Code/nano-vllm
|
||||
CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py
|
||||
```
|
||||
|
||||
### All 5 Samples
|
||||
|
||||
```bash
|
||||
cd /home/zijie/Code/nano-vllm
|
||||
CUDA_VISIBLE_DEVICES=2,3,4,5 python tests/test_ruler_niah_32k.py --all
|
||||
```
|
||||
|
||||
## Step 4: Expected Results
|
||||
|
||||
### Before Fix (Bug)
|
||||
|
||||
- Output: Garbled text like "not only has been replaced by thesiums..."
|
||||
- Score: 0% (magic number not found)
|
||||
- Time: ~80 seconds per sample
|
||||
|
||||
### After Fix (Expected)
|
||||
|
||||
- Output: The magic number (e.g., "8930103")
|
||||
- Score: ~100% (magic number found)
|
||||
- Time: ~80 seconds per sample (same, as the compute is unchanged)
|
||||
|
||||
## Debugging Tips
|
||||
|
||||
### Enable Verbose Logging
|
||||
|
||||
```python
|
||||
import logging
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
```
|
||||
|
||||
### Check Ring Buffer Size
|
||||
|
||||
In the logs, verify:
|
||||
```
|
||||
OffloadEngine initializing: num_layers=32, num_kv_buffers=4, max_seq_len=33280
|
||||
```
|
||||
|
||||
The `max_seq_len` should be `32768 + 512 = 33280` (not 32768).
|
||||
|
||||
### Monitor GPU Memory
|
||||
|
||||
```bash
|
||||
watch -n 1 nvidia-smi
|
||||
```
|
||||
|
||||
With CPU offload, GPU memory for KV cache should be ~640MB (ring buffer only).
|
||||
|
||||
## Related Files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `nanovllm/kvcache/__init__.py` | Fix location: `max_seq_len = config.max_model_len + max_new_tokens` (`max_new_tokens` defaults to 512) |
|
||||
| `nanovllm/kvcache/offload_engine.py` | Ring buffer allocation |
|
||||
| `nanovllm/engine/model_runner.py` | Layer-wise offload prefill/decode |
|
||||
| `nanovllm/kvcache/hybrid_manager.py` | CPU block management |
|
||||
|
||||
## Test Data Details
|
||||
|
||||
### NIAH Task Description
|
||||
|
||||
The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a specific piece of information (the "needle") from a large context (the "haystack").
|
||||
|
||||
- **Needle**: A magic number associated with a keyword (e.g., "worried-purse")
|
||||
- **Haystack**: ~30K tokens of distractor text
|
||||
- **Task**: Extract the magic number when asked
|
||||
|
||||
### Sample Prompt Structure
|
||||
|
||||
```
|
||||
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
|
||||
|
||||
A special magic number is hidden within the following text. Make sure to memorize it. I will quiz you about the number afterwards.
|
||||
|
||||
[... ~30K tokens of haystack text ...]
|
||||
|
||||
The special magic number for worried-purse is 8930103.
|
||||
|
||||
[... more haystack text ...]
|
||||
|
||||
What is the special magic number for worried-purse mentioned in the provided text?
|
||||
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
|
||||
|
||||
The special magic number for worried-purse mentioned in the provided text is
|
||||
```
|
||||
|
||||
The model should complete with: `8930103`
|
||||
@@ -61,6 +61,15 @@ class Config:
|
||||
self.max_model_len = min(self.max_model_len, self.hf_config.max_position_embeddings)
|
||||
assert self.max_num_batched_tokens >= self.max_model_len
|
||||
|
||||
# CPU offload mode only supports single sequence (layer-wise processing)
|
||||
if self.enable_cpu_offload and self.max_num_seqs != 1:
|
||||
import logging
|
||||
logging.warning(
|
||||
f"CPU offload mode only supports single sequence. "
|
||||
f"Overriding max_num_seqs from {self.max_num_seqs} to 1."
|
||||
)
|
||||
self.max_num_seqs = 1
|
||||
|
||||
# Override torch_dtype if user specified
|
||||
if self.dtype is not None:
|
||||
dtype_map = {
|
||||
|
||||
@@ -27,7 +27,9 @@ class ModelRunner:
|
||||
self.rank = rank
|
||||
self.event = event
|
||||
|
||||
dist.init_process_group("nccl", "tcp://localhost:2333", world_size=self.world_size, rank=rank)
|
||||
import os
|
||||
port = os.environ.get("NANOVLLM_DIST_PORT", "2333")
|
||||
dist.init_process_group("nccl", f"tcp://localhost:{port}", world_size=self.world_size, rank=rank)
|
||||
torch.cuda.set_device(rank)
|
||||
default_dtype = torch.get_default_dtype()
|
||||
torch.set_default_dtype(hf_config.torch_dtype)
|
||||
@@ -546,8 +548,8 @@ class ModelRunner:
|
||||
k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
|
||||
# Q/K norms (Qwen3 specific)
|
||||
if not layer.self_attn.qkv_bias:
|
||||
# Q/K norms (Qwen3 specific - only when qkv_bias=False)
|
||||
if not getattr(layer.self_attn, 'qkv_bias', True):
|
||||
num_tokens = q.shape[0]
|
||||
q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
|
||||
q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
|
||||
@@ -649,8 +651,8 @@ class ModelRunner:
|
||||
k_new = k_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
v_new = v_new.view(1, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
|
||||
# Q/K norms
|
||||
if not layer.self_attn.qkv_bias:
|
||||
# Q/K norms (Qwen3 specific - only when qkv_bias=False)
|
||||
if not getattr(layer.self_attn, 'qkv_bias', True):
|
||||
q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
|
||||
q = q.view(1, layer.self_attn.num_heads, layer.self_attn.head_dim)
|
||||
k_new = layer.self_attn.k_norm(k_new.reshape(-1, layer.self_attn.head_dim))
|
||||
@@ -785,8 +787,8 @@ class ModelRunner:
|
||||
k = k.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
v = v.view(total_tokens, layer.self_attn.num_kv_heads, layer.self_attn.head_dim)
|
||||
|
||||
# Q/K norms (Qwen3 specific)
|
||||
if not layer.self_attn.qkv_bias:
|
||||
# Q/K norms (Qwen3 specific - only when qkv_bias=False)
|
||||
if not getattr(layer.self_attn, 'qkv_bias', True):
|
||||
num_tokens = q.shape[0]
|
||||
q = layer.self_attn.q_norm(q.reshape(-1, layer.self_attn.head_dim))
|
||||
q = q.view(num_tokens, layer.self_attn.num_heads, layer.self_attn.head_dim)
|
||||
|
||||
@@ -71,6 +71,12 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
|
||||
threshold_blocks=getattr(config, 'sparse_threshold_blocks', 4),
|
||||
)
|
||||
|
||||
# max_seq_len needs to be larger than max_model_len to accommodate decode tokens
|
||||
# When prefill uses ~max_model_len tokens, decode needs additional slots
|
||||
# Add max_new_tokens (default 512) buffer for decode phase
|
||||
max_new_tokens = getattr(config, 'max_new_tokens', 512)
|
||||
max_seq_len = config.max_model_len + max_new_tokens
|
||||
|
||||
return HybridKVCacheManager(
|
||||
num_gpu_slots=num_gpu_blocks,
|
||||
num_cpu_blocks=num_cpu_blocks,
|
||||
@@ -78,7 +84,7 @@ def create_kvcache_manager(config: "Config") -> KVCacheManager:
|
||||
policy=eviction_policy,
|
||||
sparse_policy=sparse_policy,
|
||||
num_kv_buffers=getattr(config, 'num_kv_buffers', 4),
|
||||
max_seq_len=config.max_model_len,
|
||||
max_seq_len=max_seq_len,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -3,7 +3,13 @@
|
||||
from nanovllm.models.registry import register_model, get_model_class, MODEL_REGISTRY
|
||||
|
||||
# Import models to trigger registration
|
||||
# Qwen3 requires transformers>=4.51.0 for Qwen3Config
|
||||
try:
|
||||
from nanovllm.models import qwen3
|
||||
except ImportError as e:
|
||||
import warnings
|
||||
warnings.warn(f"Qwen3 model not available (requires transformers>=4.51.0): {e}")
|
||||
|
||||
from nanovllm.models import llama
|
||||
|
||||
__all__ = ["register_model", "get_model_class", "MODEL_REGISTRY"]
|
||||
|
||||
357
tests/test_ruler_niah.py
Normal file
357
tests/test_ruler_niah.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""
|
||||
RULER NIAH benchmark test for LLM.
|
||||
|
||||
Tests: Long context retrieval capability using pre-generated RULER benchmark data.
|
||||
The NIAH (Needle-In-A-Haystack) task tests the model's ability to retrieve a
|
||||
specific magic number from a large context (~32K tokens).
|
||||
|
||||
Usage:
|
||||
# Test all samples with CPU offload
|
||||
python tests/test_ruler_niah.py --enable-offload
|
||||
|
||||
# Test specific samples
|
||||
python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload
|
||||
|
||||
# Test with custom model
|
||||
python tests/test_ruler_niah.py --model /path/to/model --enable-offload
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["NANOVLLM_LOG_LEVEL"] = "INFO"
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
from nanovllm import LLM, SamplingParams
|
||||
from utils import check_needle_answer
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Constants
|
||||
# ============================================================
|
||||
|
||||
DEFAULT_DATA_FILE = Path(__file__).parent / "data/ruler_niah/niah_single_1_32k.jsonl"
|
||||
DEFAULT_MODEL = os.path.expanduser("~/models/Llama-3.1-8B-Instruct")
|
||||
DEFAULT_MAX_MODEL_LEN = 32768
|
||||
DEFAULT_MAX_NEW_TOKENS = 50
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Data Loading
|
||||
# ============================================================
|
||||
|
||||
def load_ruler_samples(filepath: Path, indices: Optional[List[int]] = None) -> List[dict]:
    """
    Load RULER NIAH samples from a JSONL file.

    Args:
        filepath: Path to the JSONL file
        indices: Optional list of 0-based line indices to load. If None, load all.

    Returns:
        List of sample dicts with keys: index, input, outputs, length.
        Samples are returned in file order regardless of the order of ``indices``.

    Raises:
        FileNotFoundError: if ``filepath`` does not exist.
        ValueError: if no sample was selected (empty file, or none of the
            requested indices are present).
    """
    if not filepath.exists():
        raise FileNotFoundError(
            f"Data file not found: {filepath}\n"
            f"Please copy RULER NIAH data to this location. See docs/ruler_niah_standalone_test.md"
        )

    # Build the lookup set once so each line costs an O(1) membership test.
    wanted = None if indices is None else set(indices)

    samples = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filepath, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if wanted is not None and i not in wanted:
                continue
            if not line.strip():
                # Skip blank lines (e.g. a trailing newline at end of file)
                # instead of failing inside json.loads.
                continue
            samples.append(json.loads(line))

    if not samples:
        raise ValueError(f"No samples loaded from {filepath}")

    return samples
|
||||
|
||||
|
||||
def count_samples(filepath: Path) -> int:
    """Return the number of lines in a JSONL file (one sample per line)."""
    line_total = 0
    with open(filepath) as handle:
        for _line in handle:
            line_total += 1
    return line_total
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Test Function
|
||||
# ============================================================
|
||||
|
||||
def run_ruler_niah_test(
    model_path: str,
    data_file: Path,
    sample_indices: Optional[List[int]] = None,
    max_model_len: int = DEFAULT_MAX_MODEL_LEN,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
    enable_cpu_offload: bool = False,
    num_gpu_blocks: int = 4,
    block_size: int = 1024,
    gpu_utilization: float = 0.9,
    enforce_eager: bool = True,
    verbose: bool = True,
) -> Tuple[int, int]:
    """
    Run the RULER NIAH retrieval check over the selected samples.

    Args:
        model_path: Path to the model
        data_file: Path to JSONL data file
        sample_indices: List of sample indices to test (None = all)
        max_model_len: Maximum model context length
        max_new_tokens: Maximum tokens to generate
        enable_cpu_offload: Enable CPU offload mode
        num_gpu_blocks: Number of GPU blocks for offload (only passed to the
            LLM when CPU offload is enabled)
        block_size: KV cache block size
        gpu_utilization: GPU memory utilization fraction
        enforce_eager: Disable CUDA graphs
        verbose: Print detailed output

    Returns:
        (correct, total): Number of correct and total samples
    """
    samples = load_ruler_samples(data_file, sample_indices)
    total = len(samples)
    rule = "=" * 60

    if verbose:
        print(f"\n{rule}")
        print("RULER NIAH Test")
        print(rule)
        print(f"Model: {model_path}")
        print(f"Data file: {data_file}")
        print(f"Samples: {total}")
        print(f"Max model len: {max_model_len}")
        print(f"Max new tokens: {max_new_tokens}")
        print(f"CPU offload: {enable_cpu_offload}")
        if enable_cpu_offload:
            print(f" num_gpu_blocks: {num_gpu_blocks}")
            print(f" block_size: {block_size}")
        print(f"Enforce eager: {enforce_eager}")
        print(f"{rule}\n")

    # Warn when the longest sample's declared length exceeds the configured
    # context window.
    longest = max(entry.get("length", 0) for entry in samples)
    if longest > max_model_len:
        print(f"WARNING: max_model_len ({max_model_len}) < max data length ({longest})")
        print(" This may cause truncation or errors.\n")

    if verbose:
        print("Initializing LLM...")

    llm_kwargs = dict(
        max_model_len=max_model_len,
        max_num_batched_tokens=max_model_len,
        enforce_eager=enforce_eager,
        gpu_memory_utilization=gpu_utilization,
        kvcache_block_size=block_size,
        enable_cpu_offload=enable_cpu_offload,
    )
    if enable_cpu_offload:
        llm_kwargs.update(num_gpu_blocks=num_gpu_blocks)

    llm = LLM(model_path, **llm_kwargs)

    # Note: nano-vllm doesn't support greedy (temperature=0), use low temperature instead
    sampling_params = SamplingParams(
        temperature=0.1,  # Low temperature for near-deterministic output
        max_tokens=max_new_tokens,
    )

    results = []
    for position, sample in enumerate(samples):
        # Fall back to the enumeration position when a sample has no "index".
        sample_idx = sample.get("index", position)
        prompt = sample["input"]
        expected = sample["outputs"][0]
        data_len = sample.get("length", "unknown")

        if verbose:
            print(f"\nSample {sample_idx}: Expected={expected}, Length={data_len}")

        generation = llm.generate([prompt], sampling_params, use_tqdm=False)[0]
        output_text = generation["text"]
        output_tokens = generation["token_ids"]

        passed = check_needle_answer(output_text, expected)
        results.append({
            "index": sample_idx,
            "expected": expected,
            "output": output_text,
            "passed": passed,
        })

        if verbose:
            output_preview = output_text[:100].replace('\n', ' ')
            print(f" Output ({len(output_tokens)} tokens): {output_preview}...")
            print(f" Status: {'PASS' if passed else 'FAIL'}")

    failures = [entry for entry in results if not entry["passed"]]
    correct = total - len(failures)

    if verbose:
        print(f"\n{rule}")
        print(f"Results: {correct}/{total} PASSED ({100*correct/total:.1f}%)")
        print(f"{rule}\n")

        if failures:
            print("Failed samples:")
            for entry in failures:
                print(f" Sample {entry['index']}: expected={entry['expected']}, got={entry['output'][:50]}...")

    return correct, total
|
||||
|
||||
|
||||
# ============================================================
|
||||
# CLI Entry Point
|
||||
# ============================================================
|
||||
|
||||
def parse_indices(s: str) -> Optional[List[int]]:
    """Parse comma-separated indices like '0,1,2' or range like '0-4'.

    Items may be mixed ('0,2-4,7'). Whitespace around items and empty items
    (e.g. a trailing comma) are ignored. Ranges are inclusive on both ends.

    Args:
        s: The raw index specification.

    Returns:
        The expanded list of indices in the order given, or None when ``s``
        is empty or whitespace-only (the caller passes None through to mean
        "all samples").

    Raises:
        ValueError: if an item is not an integer or a valid 'START-END' range.
    """
    if not s or not s.strip():
        return None

    indices: List[int] = []
    for part in s.split(','):
        part = part.strip()
        if not part:
            # Tolerate stray commas such as "0,1," instead of failing on int("").
            continue

        start_text, dash, end_text = part.partition('-')
        if not dash:
            indices.append(int(part))
            continue

        try:
            start, end = int(start_text), int(end_text)
        except ValueError:
            raise ValueError(
                f"Invalid index range {part!r}; expected 'START-END', e.g. '0-4'"
            ) from None
        if start > end:
            # A descending range would otherwise silently select nothing.
            raise ValueError(f"Invalid index range {part!r}: start is greater than end")
        indices.extend(range(start, end + 1))

    return indices
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: parse options, run the benchmark, and exit non-zero
    # when any sample fails.
    parser = argparse.ArgumentParser(
        description="RULER NIAH benchmark test for long context LLM",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Test all samples with CPU offload (recommended for 24GB GPUs)
python tests/test_ruler_niah.py --enable-offload

# Test specific samples
python tests/test_ruler_niah.py --sample-indices 0,1,2 --enable-offload

# Test with CUDA graph enabled
python tests/test_ruler_niah.py --enable-offload --use-cuda-graph
"""
    )

    parser.add_argument(
        "--model", "-m",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Path to model (default: {DEFAULT_MODEL})"
    )
    parser.add_argument(
        "--data-file",
        type=str,
        default=str(DEFAULT_DATA_FILE),
        help=f"Path to JSONL data file (default: {DEFAULT_DATA_FILE})"
    )
    parser.add_argument(
        "--sample-indices",
        type=str,
        default="",
        help="Sample indices to test (e.g., '0,1,2' or '0-4'). Default: all"
    )
    parser.add_argument(
        "--max-model-len",
        type=int,
        default=DEFAULT_MAX_MODEL_LEN,
        help=f"Maximum model context length (default: {DEFAULT_MAX_MODEL_LEN})"
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=DEFAULT_MAX_NEW_TOKENS,
        help=f"Maximum tokens to generate (default: {DEFAULT_MAX_NEW_TOKENS})"
    )
    parser.add_argument(
        "--enable-offload",
        action="store_true",
        help="Enable CPU offload mode (required for 24GB GPUs with 32K context)"
    )
    parser.add_argument(
        "--num-gpu-blocks",
        type=int,
        default=4,
        help="Number of GPU blocks for CPU offload (default: 4)"
    )
    parser.add_argument(
        "--block-size",
        type=int,
        default=1024,
        help="KV cache block size (default: 1024)"
    )
    parser.add_argument(
        "--gpu-utilization",
        type=float,
        default=0.9,
        help="GPU memory utilization fraction (default: 0.9)"
    )
    # NOTE: store_true with default=True makes this flag a no-op; it is kept
    # so existing command lines still parse. Eager mode is actually derived
    # from --use-cuda-graph below.
    parser.add_argument(
        "--enforce-eager",
        action="store_true",
        default=True,
        help="Force eager execution, disable CUDA graphs (default: True)"
    )
    parser.add_argument(
        "--use-cuda-graph",
        action="store_true",
        help="Enable CUDA graph (overrides --enforce-eager)"
    )
    # NOTE: like --enforce-eager, this flag is a no-op kept for
    # compatibility; verbosity is derived from --quiet below.
    parser.add_argument(
        "--verbose",
        action="store_true",
        default=True,
        help="Print detailed output (default: True)"
    )
    parser.add_argument(
        "--quiet", "-q",
        action="store_true",
        help="Quiet mode, only print final result"
    )

    args = parser.parse_args()

    # Process arguments
    sample_indices = parse_indices(args.sample_indices)
    enforce_eager = not args.use_cuda_graph
    verbose = not args.quiet

    # Run test
    correct, total = run_ruler_niah_test(
        model_path=os.path.expanduser(args.model),
        data_file=Path(args.data_file),
        sample_indices=sample_indices,
        max_model_len=args.max_model_len,
        max_new_tokens=args.max_new_tokens,
        enable_cpu_offload=args.enable_offload,
        num_gpu_blocks=args.num_gpu_blocks,
        block_size=args.block_size,
        gpu_utilization=args.gpu_utilization,
        enforce_eager=enforce_eager,
        verbose=verbose,
    )

    # Final status
    if correct == total:
        print("test_ruler_niah: PASSED")
    else:
        print(f"test_ruler_niah: FAILED ({correct}/{total})")
        # The builtin exit() is injected by the site module and is not
        # guaranteed to exist (e.g. under `python -S`); raise SystemExit
        # directly for a reliable non-zero exit status.
        raise SystemExit(1)
|
||||
Reference in New Issue
Block a user