[WIP] remove num_prefetch_blocks variable.

Author: Zijie Tian
Date:   2025-12-24 18:22:26 +08:00
Parent: b264de903d
Commit: 782437c486
10 changed files with 465 additions and 18 deletions


@@ -5,17 +5,20 @@ Demonstrates: LLM initialization, prefill execution with CPU offload enabled.
"""
import os
os.environ["NANOVLLM_LOG_LEVEL"] = "DEBUG"
from random import randint, seed
from nanovllm import LLM, SamplingParams
# ============================================================
# Configuration
# ============================================================
MODEL_PATH = os.path.expanduser("~/models/Qwen3-0.6B/")
-MAX_MODEL_LEN = 8192
-NUM_GPU_BLOCKS = 4
-INPUT_LEN = 4096
+MAX_MODEL_LEN = 32 * 1024
+NUM_GPU_BLOCKS = 2
+INPUT_LEN = 16 * 1024
# ============================================================
# Main Test Script
@@ -28,6 +31,7 @@ llm = LLM(
max_model_len=MAX_MODEL_LEN,
max_num_batched_tokens=MAX_MODEL_LEN,
enable_cpu_offload=True,
+kvcache_block_size=1024,
num_gpu_blocks=NUM_GPU_BLOCKS,
)
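
For context on the new numbers: with kvcache_block_size=1024 and only 2 GPU blocks, a 16K-token prompt cannot keep its KV cache resident on the GPU, which is exactly what exercises the CPU-offload path. A minimal sketch of the block arithmetic follows; the ceil-division sizing is the usual paged-KV convention and the variable names mirror the diff, not nanovllm's internals.

# Sketch: how many KV-cache blocks the demo prompt needs, and how many
# must spill to CPU. Constants mirror the diff; the math is illustrative,
# not a copy of nanovllm's allocator logic.
KVCACHE_BLOCK_SIZE = 1024        # tokens per KV-cache block
NUM_GPU_BLOCKS = 2               # blocks kept resident on the GPU
INPUT_LEN = 16 * 1024            # prompt length in tokens

total_blocks = -(-INPUT_LEN // KVCACHE_BLOCK_SIZE)   # ceil(16384/1024) = 16
gpu_tokens = NUM_GPU_BLOCKS * KVCACHE_BLOCK_SIZE     # 2048 tokens fit on GPU
cpu_blocks = max(total_blocks - NUM_GPU_BLOCKS, 0)   # 14 blocks offloaded

print(f"{total_blocks} blocks total: {NUM_GPU_BLOCKS} on GPU "
      f"({gpu_tokens} tokens), {cpu_blocks} offloaded to CPU")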