nano-vllm/tests/test_prefill.py
"""
Test script for chunked prefill with CPU offload.
Demonstrates: LLM initialization, prefill execution with CPU offload enabled.
"""
import os
# Set before the nanovllm import so the DEBUG log level is picked up
# when the package initializes.
os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
from random import randint, seed
from nanovllm import LLM, SamplingParams
# ============================================================
# Configuration
# ============================================================
MODEL_PATH = os.path.expanduser("~/models/Qwen3-0.6B/")
MAX_MODEL_LEN = 32 * 1024
NUM_GPU_BLOCKS = 2
INPUT_LEN = 16 * 1024
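# With kvcache_block_size=1024 (set below), NUM_GPU_BLOCKS=2 keeps only
# 2 * 1024 = 2K tokens of KV cache resident on the GPU, while the prompt
# is 16K tokens -- so the prefill must spill most of its KV cache to CPU.
# (This reading of the knobs follows the docstring above; the exact
# offload semantics live in the nano-vllm fork under test.)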
# ============================================================
# Main Test Script
# ============================================================
# 1. Initialize LLM with CPU offload
llm = LLM(
    MODEL_PATH,
    enforce_eager=True,                    # skip CUDA graph capture; easier to debug
    max_model_len=MAX_MODEL_LEN,
    max_num_batched_tokens=MAX_MODEL_LEN,  # scheduler may batch up to a full context per step
    enable_cpu_offload=True,
    kvcache_block_size=1024,               # tokens per KV-cache block
    num_gpu_blocks=NUM_GPU_BLOCKS,         # only 2 blocks stay GPU-resident
)
# 2. Generate random prompt tokens
seed(42)
prompt_token_ids = [[randint(0, 10000) for _ in range(INPUT_LEN)]]
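# The random IDs span [0, 10000], comfortably inside Qwen3's vocabulary
# (~151K entries), so every ID maps to a valid token.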
# 3. Run prefill (max_tokens=1 to focus on prefill only)
sampling_params = SamplingParams(temperature=0.6, ignore_eos=True, max_tokens=1)
outputs = llm.generate(prompt_token_ids, sampling_params, use_tqdm=False)
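# generate() is expected to return one result dict per input prompt, with
# the newly sampled tokens under "token_ids" -- the contract the asserts
# below check.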
# 4. Verify output
assert len(outputs) == 1
assert "token_ids" in outputs[0]
assert len(outputs[0]["token_ids"]) == 1
print("test_prefill: PASSED")