diff --git a/tests/test_needle.py b/tests/test_needle.py deleted file mode 100644 index 92f707e..0000000 --- a/tests/test_needle.py +++ /dev/null @@ -1,254 +0,0 @@ -""" -Needle-in-a-haystack test for LLM. - -Tests: Long context retrieval capability with configurable sequence length. - -NOTE: CPU offload mode has a known bug that causes incorrect outputs for -sequences longer than ~200 tokens. Use --no-offload for correctness testing. -""" - -import os -os.environ["NANOVLLM_LOG_LEVEL"] = "INFO" - -import argparse -from nanovllm import LLM, SamplingParams -from nanovllm.config import SparsePolicyType -from utils import generate_needle_prompt, check_needle_answer - - -# ============================================================ -# Main Test -# ============================================================ - -def run_needle_test( - model_path: str, - max_model_len: int, - input_len: int, - num_gpu_blocks: int = 4, - block_size: int = 1024, - needle_position: float = 0.5, - needle_value: str = "7492", - max_new_tokens: int = 32, - enable_cpu_offload: bool = False, - enable_quest: bool = False, - enable_xattn_bsa: bool = False, - sparse_topk: int = 8, - sparse_threshold: int = 4, - sparse_samples: int = 128, - verbose: bool = True, -) -> bool: - """ - Run a needle-in-haystack test. - - Args: - model_path: Path to model - max_model_len: Maximum model context length - input_len: Target input sequence length - num_gpu_blocks: Number of GPU blocks for offload - block_size: KV cache block size - needle_position: Where to place needle (0.0-1.0) - needle_value: The secret value to find - max_new_tokens: Maximum tokens to generate - enable_cpu_offload: Enable CPU offload mode - enable_quest: Enable Quest sparse attention (decode-only Top-K) - enable_xattn_bsa: Enable XAttention BSA sparse attention (prefill-only) - sparse_topk: Top-K blocks for Quest - sparse_threshold: Threshold for sparse selection (Quest/XAttention BSA) - sparse_samples: Samples per chunk for XAttention BSA estimation - verbose: Print detailed output - - Returns: - True if test passed, False otherwise - """ - # Determine sparse policy - if enable_xattn_bsa: - sparse_policy = SparsePolicyType.XATTN_BSA - elif enable_quest: - sparse_policy = SparsePolicyType.QUEST - else: - sparse_policy = SparsePolicyType.FULL - - if verbose: - print(f"\n{'='*60}") - print(f"Needle-in-Haystack Test") - print(f"{'='*60}") - print(f"Model: {model_path}") - print(f"Max model len: {max_model_len}") - print(f"Input length: {input_len}") - print(f"Block size: {block_size}") - print(f"Needle position: {needle_position:.0%}") - print(f"Needle value: {needle_value}") - print(f"CPU offload: {enable_cpu_offload}") - if enable_cpu_offload: - print(f"Sparse policy: {sparse_policy.name}") - if sparse_policy == SparsePolicyType.QUEST: - print(f" Quest: topk={sparse_topk}, threshold={sparse_threshold}") - elif sparse_policy == SparsePolicyType.XATTN_BSA: - print(f" XAttention BSA: threshold={sparse_threshold}, samples={sparse_samples}") - print(f"{'='*60}\n") - - # 1. Initialize LLM - llm_kwargs = { - "enforce_eager": True, - "max_model_len": max_model_len, - "max_num_batched_tokens": max_model_len, - "enable_cpu_offload": enable_cpu_offload, - "kvcache_block_size": block_size, - } - if enable_cpu_offload: - llm_kwargs["num_gpu_blocks"] = num_gpu_blocks - llm_kwargs["sparse_policy"] = sparse_policy - if sparse_policy == SparsePolicyType.QUEST: - llm_kwargs["sparse_topk_blocks"] = sparse_topk - llm_kwargs["sparse_threshold_blocks"] = sparse_threshold - elif sparse_policy == SparsePolicyType.XATTN_BSA: - llm_kwargs["sparse_threshold"] = float(sparse_threshold) / 10.0 # Convert to 0.0-1.0 range - llm_kwargs["sparse_samples_per_chunk"] = sparse_samples - - llm = LLM(model_path, **llm_kwargs) - - # 2. Generate needle prompt - prompt, expected = generate_needle_prompt( - tokenizer=llm.tokenizer, - target_length=input_len, - needle_position=needle_position, - needle_value=needle_value, - ) - - # 3. Generate output - sampling_params = SamplingParams( - temperature=0.6, # Moderate temperature - max_tokens=max_new_tokens, - ) - outputs = llm.generate([prompt], sampling_params, use_tqdm=True) - - # 4. Check result - output_text = outputs[0]["text"] - output_token_ids = outputs[0]["token_ids"] - passed = check_needle_answer(output_text, expected) - - if verbose: - print(f"\n{'='*60}") - print(f"Result") - print(f"{'='*60}") - print(f"Expected: {expected}") - print(f"Output tokens ({len(output_token_ids)}): {output_token_ids[:20]}") - print(f"Output: {output_text[:200]}...") - print(f"Status: {'PASSED' if passed else 'FAILED'}") - print(f"{'='*60}\n") - - return passed - - -# ============================================================ -# CLI Entry Point -# ============================================================ - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Needle-in-haystack test for long context LLM") - parser.add_argument( - "--model", "-m", - type=str, - default=os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/"), - help="Path to model" - ) - parser.add_argument( - "--max-model-len", - type=int, - default=128 * 1024, - help="Maximum model context length" - ) - parser.add_argument( - "--input-len", - type=int, - default=8 * 1024, - help="Target input sequence length" - ) - parser.add_argument( - "--num-gpu-blocks", - type=int, - default=2, - help="Number of GPU blocks for CPU offload" - ) - parser.add_argument( - "--block-size", - type=int, - default=1024, - help="KV cache block size" - ) - parser.add_argument( - "--needle-position", - type=float, - default=0.5, - help="Needle position (0.0=start, 0.5=middle, 1.0=end)" - ) - parser.add_argument( - "--needle-value", - type=str, - default="7492", - help="The secret value to hide" - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=32, - help="Maximum tokens to generate" - ) - parser.add_argument( - "--enable-offload", - action="store_true", - help="Enable CPU offload (has known bug for long sequences)" - ) - parser.add_argument( - "--enable-quest", - action="store_true", - help="Enable Quest sparse attention (decode-only Top-K selection)" - ) - parser.add_argument( - "--enable-xattn-bsa", - action="store_true", - help="Enable XAttention BSA sparse attention (prefill-only)" - ) - parser.add_argument( - "--sparse-topk", - type=int, - default=8, - help="Top-K blocks for Quest sparse attention" - ) - parser.add_argument( - "--sparse-threshold", - type=int, - default=4, - help="Apply sparse only when blocks > threshold (Quest) or attention threshold 0-9 (XAttention BSA)" - ) - parser.add_argument( - "--sparse-samples", - type=int, - default=128, - help="Samples per chunk for XAttention BSA estimation" - ) - args = parser.parse_args() - - passed = run_needle_test( - model_path=args.model, - max_model_len=args.max_model_len, - input_len=args.input_len, - num_gpu_blocks=args.num_gpu_blocks, - block_size=args.block_size, - needle_position=args.needle_position, - needle_value=args.needle_value, - max_new_tokens=args.max_new_tokens, - enable_cpu_offload=args.enable_offload, - enable_quest=args.enable_quest, - enable_xattn_bsa=args.enable_xattn_bsa, - sparse_topk=args.sparse_topk, - sparse_threshold=args.sparse_threshold, - sparse_samples=args.sparse_samples, - verbose=True, - ) - - if passed: - print("test_needle: PASSED") - else: - print("test_needle: FAILED") - exit(1) diff --git a/tests/test_needle_ref.py b/tests/test_needle_ref.py deleted file mode 100644 index 8431e2d..0000000 --- a/tests/test_needle_ref.py +++ /dev/null @@ -1,176 +0,0 @@ -""" -Needle-in-a-haystack reference test using pure torch + transformers. - -This is a reference implementation for comparison with nanovllm. -Uses standard HuggingFace inference (no custom KV cache, no offload). -""" - -import os -import argparse -import torch -from transformers import AutoTokenizer -from modeling_qwen3 import Qwen3ForCausalLM -from utils import generate_needle_prompt, check_needle_answer - - -# ============================================================ -# Main Test -# ============================================================ - -def run_needle_test( - model_path: str, - input_len: int, - needle_position: float = 0.5, - needle_value: str = "7492", - max_new_tokens: int = 32, - dtype: str = "auto", - verbose: bool = True, -) -> bool: - """ - Run a needle-in-haystack test using standard transformers inference. - - Args: - model_path: Path to model - input_len: Target input sequence length - needle_position: Where to place needle (0.0-1.0) - needle_value: The secret value to find - max_new_tokens: Maximum tokens to generate - dtype: Model dtype ("auto", "float16", "bfloat16") - verbose: Print detailed output - - Returns: - True if test passed, False otherwise - """ - if verbose: - print(f"\n{'='*60}") - print(f"Needle-in-Haystack Reference Test (torch + transformers)") - print(f"{'='*60}") - print(f"Model: {model_path}") - print(f"Input length: {input_len}") - print(f"Needle position: {needle_position:.0%}") - print(f"Needle value: {needle_value}") - print(f"Dtype: {dtype}") - print(f"{'='*60}\n") - - # 1. Load tokenizer - print("[1/4] Loading tokenizer...") - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - - # 2. Generate needle prompt - print("[2/4] Generating needle prompt...") - prompt, expected = generate_needle_prompt( - tokenizer=tokenizer, - target_length=input_len, - needle_position=needle_position, - needle_value=needle_value, - ) - - # 3. Load model - print("[3/4] Loading model...") - torch_dtype = { - "auto": torch.float16, # default to float16 for custom model - "float16": torch.float16, - "bfloat16": torch.bfloat16, - }.get(dtype, torch.float16) - - model = Qwen3ForCausalLM.from_pretrained(model_path, dtype=torch_dtype) - model = model.to("cuda" if torch.cuda.is_available() else "cpu") - model.eval() - - # 4. Generate output - print("[4/4] Running inference...") - device = next(model.parameters()).device - input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device) - print(f" Input shape: {input_ids.shape}") - - with torch.no_grad(): - output_ids = model.generate( - input_ids, - max_new_tokens=max_new_tokens, - temperature=0.6, - do_sample=True, - pad_token_id=tokenizer.eos_token_id, - ) - - # Decode only the new tokens - new_token_ids = output_ids[0, input_ids.shape[1]:] - output_text = tokenizer.decode(new_token_ids, skip_special_tokens=False) - - # 5. Check result - passed = check_needle_answer(output_text, expected) - - if verbose: - print(f"\n{'='*60}") - print(f"Result") - print(f"{'='*60}") - print(f"Expected: {expected}") - print(f"Output tokens ({len(new_token_ids)}): {new_token_ids[:20].tolist()}") - print(f"Output: {output_text[:200]}...") - print(f"Status: {'PASSED' if passed else 'FAILED'}") - print(f"{'='*60}\n") - - return passed - - -# ============================================================ -# CLI Entry Point -# ============================================================ - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Needle-in-haystack reference test (torch + transformers)" - ) - parser.add_argument( - "--model", "-m", - type=str, - default=os.path.expanduser("~/models/Qwen3-4B-Instruct-2507/"), - help="Path to model" - ) - parser.add_argument( - "--input-len", - type=int, - default=8 * 1024, - help="Target input sequence length" - ) - parser.add_argument( - "--needle-position", - type=float, - default=0.5, - help="Needle position (0.0=start, 0.5=middle, 1.0=end)" - ) - parser.add_argument( - "--needle-value", - type=str, - default="7492", - help="The secret value to hide" - ) - parser.add_argument( - "--max-new-tokens", - type=int, - default=32, - help="Maximum tokens to generate" - ) - parser.add_argument( - "--dtype", - type=str, - default="auto", - choices=["auto", "float16", "bfloat16"], - help="Model dtype" - ) - args = parser.parse_args() - - passed = run_needle_test( - model_path=args.model, - input_len=args.input_len, - needle_position=args.needle_position, - needle_value=args.needle_value, - max_new_tokens=args.max_new_tokens, - dtype=args.dtype, - verbose=True, - ) - - if passed: - print("test_needle_ref: PASSED") - else: - print("test_needle_ref: FAILED") - exit(1)